# Importing Libraries & creating TFIDF object

In [124]:
import pandas as pd
import re
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

comments = pd.read_csv("mergedDataSet.csv", encoding='ISO-8859-1',usecols=['comment_text','merged_rating'],index_col=False)

v = TfidfVectorizer(use_idf=True, max_df=0.7, lowercase=True, stop_words="english", strip_accents="unicode",
                    token_pattern=r"(?u)\b\w*[a-zA-Z]\w*\b", ngram_range=(1, 2))



v.fit(comments['comment_text'])

x_tfidf = v.transform(comments["comment_text"])

y = comments["merged_rating"]
clf = MultinomialNB()

scores = cross_val_score(clf, x_tfidf, y, cv=10)

# SMOTE

In [112]:
os = SMOTE(random_state=0)

X_train, X_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.3, random_state=42)
os_data_X,os_data_y=os.fit_sample(X_train,y_train)

clf.fit(os_data_X,os_data_y)

y_pred = clf.predict(X_test)
metrics.precision_recall_fscore_support(y_test,y_pred,average='macro')

(0.59982666618798508, 0.71255058821512185, 0.63120257455454609, None)

In [117]:
fpr,tpr,thresholds = metrics.roc_curve(y_test,y_pred,pos_label=3)




array([ 0.        ,  0.06103559,  0.17928816,  1.        ])

# Grid Search for Naive bayes/TFIDF

In [128]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words,lowercase=True,token_pattern=r"(?u)\b\w*[a-zA-Z]\w*\b")),
    ('clf', OneVsRestClassifier(MultinomialNB(
        fit_prior=True, class_prior=None))),
])
parameters = {
    'tfidf__max_df': (0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__estimator__alpha': (1e-2, 1e-3)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(comments['comment_text'].values, y)

print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed: 25.4min
[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed: 44.5min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',... 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w*[a-zA-Z]\\w*\\b', tokenizer=None,
        use_idf=True, vocabulary=None)), ('clf', OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True),
          n_jobs=

# Random Forest

In [145]:
random_forest = RandomForestClassifier(n_estimators=10)
random_forest.fit(os_data_X, os_data_y)
Y_pred = random_forest.predict(X_test)
random_forest.score(y_test,Y_pred)