In [14]:
import nltk
import numpy as np
import pandas as pd
import string
import re
import matplotlib.pyplot as plt

import warnings
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Load Dataset
# Show up to 100 characters per column when frames are displayed, so long
# review text is truncated rather than flooding the output.
pd.set_option('display.max_colwidth', 100)
data = pd.read_csv('./data/movie_data.csv', sep = ',')

# Shuffle all rows with a fixed seed, then keep the first 1000. Without
# random_state a different subset is drawn on every kernel restart, which makes
# every downstream result impossible to reproduce.
data = data.sample(frac = 1, random_state = 42)
# Positional slice; the shuffled original index labels are kept as-is.
data = data.iloc[:1000]

In [15]:
# Preprocessing resources used by clean_text.
# NOTE(review): these lookups presumably require the NLTK 'stopwords' and
# 'wordnet' corpora to be downloaded already (nltk.download) — confirm setup.
# Stored as a set: clean_text tests every token for membership, and a set
# lookup is constant-time where the original list was a linear scan.
stopwords = set(nltk.corpus.stopwords.words('english'))
# Despite the name `ps`, this is a WordNet lemmatizer, not a Porter stemmer.
ps = nltk.WordNetLemmatizer()

def clean_text(text):
    """Turn one raw document into a list of lemmatized tokens.

    Steps, in order: delete every character found in ``string.punctuation``,
    lowercase the remaining characters, split on runs of non-word characters,
    drop empty tokens and stopwords, and lemmatize what is left with the
    module-level ``ps``.

    Args:
        text: The raw document as a string.

    Returns:
        list of str: Lemmatized tokens in their original order. Empty when the
        document contains no usable tokens.
    """
    # Iterating a string yields characters. Punctuation is deleted rather than
    # replaced by a space, so "don't" becomes "dont".
    text = ''.join(ch.lower() for ch in text if ch not in string.punctuation)
    # Raw string: '\W' inside a normal literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # re.split produces '' when the text starts or ends with a separator (or is
    # empty); skip those so '' never becomes a vocabulary entry.
    return [ps.lemmatize(token) for token in tokens if token and token not in stopwords]

In [16]:
# Vectorization -- TF-IDF
# clean_text is passed as the analyzer, so it is called on each raw review and
# its returned token list is what gets counted.
TFIDF_Vect = TfidfVectorizer(analyzer = clean_text)
X_TFIDF = TFIDF_Vect.fit_transform(data['review'])

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(TFIDF_Vect, 'get_feature_names_out'):
    feature_names = TFIDF_Vect.get_feature_names_out()
else:
    feature_names = TFIDF_Vect.get_feature_names()

# Dense copy of the sparse TF-IDF matrix with one labelled column per term.
X_TFIDF_pd = pd.DataFrame(X_TFIDF.toarray(), columns = feature_names)



In [None]:
# Random forest: 5-fold grid search over 3 n_estimators x 4 max_depth values
# (12 candidates), ranked by mean cross-validated score.
warnings.filterwarnings('ignore', category = DeprecationWarning)

# Fixed seed so the forest, and therefore the ranking of candidates, is the
# same on every run instead of varying with each kernel restart.
rf = RandomForestClassifier(random_state = 42)
param = {'n_estimators' : [10, 150, 300],
         'max_depth' : [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv = 5, n_jobs = 4)
# fit() returns the fitted search object itself, so gs_fit and gs are the same object.
gs_fit = gs.fit(X_TFIDF_pd, data['sentiment'])
# One row per candidate, best mean test score first.
result1 = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending = False)

In [5]:
# RBF-kernel SVM: 5-fold grid search over 3 gamma x 3 C values (9 candidates).
# probability=True is kept so the fitted model can expose predict_proba, but it
# makes every fit slower; drop it if probabilities are never needed.
# random_state seeds the shuffling used for the probability estimates, so
# repeated runs produce the same fitted models.
# NOTE(review): the saved output under this cell shows 3 CV splits and only
# C in {1, 5} / gamma in {0.1, 0.5}; it appears to come from an earlier
# version of this grid — re-run the cell to refresh it.
svc = SVC(kernel = 'rbf' , probability=True, random_state = 42)
param = {'gamma' : [0.1, 0.5, 1],
         'C' : [1, 5, 10]}

gs2 = GridSearchCV(svc, param, cv = 5, n_jobs = 4)
# fit() returns the fitted search object itself, so gs_fit2 and gs2 are the same object.
gs_fit2 = gs2.fit(X_TFIDF_pd, data['sentiment'])
# One row per candidate, best mean test score first.
result2 = pd.DataFrame(gs_fit2.cv_results_).sort_values('mean_test_score', ascending = False)

   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_C  \
2      56.524472      5.195961         5.756372        0.701491       5   
3      59.492558      0.261674         6.012214        0.339322       5   
1      61.335530      0.805405         6.645065        0.209664       1   
0      57.665804      2.701728         5.599381        1.200460       1   

  param_gamma                  params  split0_test_score  split1_test_score  \
2         0.1  {'C': 5, 'gamma': 0.1}           0.790419           0.790419   
3         0.5  {'C': 5, 'gamma': 0.5}           0.796407           0.802395   
1         0.5  {'C': 1, 'gamma': 0.5}           0.772455           0.772455   
0         0.1  {'C': 1, 'gamma': 0.1}           0.502994           0.502994   

   split2_test_score  mean_test_score  std_test_score  rank_test_score  
2           0.807229         0.796022        0.007924                1  
3           0.789157         0.795986        0.005413                2  
1        

In [6]:
# Gradient boosting: 5-fold grid search over the number of boosting stages.
# Fixed seed so repeated runs build the same ensembles and report the same scores.
# NOTE(review): the saved output under this cell lists n_estimators 10 and 150
# with 2 CV splits, which does not match this grid or cv=5 — it appears stale;
# re-run the cell to refresh it.
gbc = GradientBoostingClassifier(random_state = 42)
param = {'n_estimators' : [100, 300, 500]}

gs3 = GridSearchCV(gbc, param, cv = 5, n_jobs = 4)
# fit() returns the fitted search object itself, so gs_fit3 and gs3 are the same object.
gs_fit3 = gs3.fit(X_TFIDF_pd, data['sentiment'])
# One row per candidate, best mean test score first.
result3 = pd.DataFrame(gs_fit3.cv_results_).sort_values('mean_test_score', ascending = False)

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
1      29.635545      0.327500           0.1215        0.004501   
0       2.683498      0.025499           0.1470        0.012998   

  param_n_estimators                 params  split0_test_score  \
1                150  {'n_estimators': 150}              0.680   
0                 10   {'n_estimators': 10}              0.652   

   split1_test_score  mean_test_score  std_test_score  rank_test_score  
1              0.672            0.676           0.004                1  
0              0.676            0.664           0.012                2  
