In [1]:
import re 
import nltk
import spacy
import pickle
import uvicorn
import pandas as pd
from sklearn.svm import SVC    
from fastapi import FastAPI
from pydantic import BaseModel
from xgboost import XGBClassifier
from nltk.corpus import stopwords   
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier       
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm
Using TensorFlow backend.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\darpa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
class model:
    def read_dataset(data):
        dataset = pd.read_csv(data)
        dataset.drop('Unnamed: 0', axis = 1, inplace = True)
        dataset.columns = ['sentiment', 'review']
        dataset.review = dataset.review.str.split(" ",1).str[1]
        encoded_dict = {'negative': 0, 'positive': 1}
        dataset['sentiment'] = dataset.sentiment.map(encoded_dict)
        return dataset


    def clean_dataset(column):
        stop_words = stopwords.words('english')
        stop_words.remove('not')
        stop_words.remove('or')
        for row in column:
        # Split CamelCase words
            row = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', str(row))).split()
        # Remove special characters and numbers 
            row = re.sub('[^a-zA-Z]', ' ', str(row))
        # Remove Repeated words
            row = re.sub(r"\\b(\\w+)(?:\\W+\\1\\b)+", "", str(row))
        # Replace tabs and newlines with a single space
            row = re.sub("(\\t)", " ", str(row))
            row = re.sub("(\\r)", " ", str(row))
            row = re.sub("(\\n)", " ", str(row))
        # Remove single alphabets
            row = re.sub(r'(?:^| )\w(?:$| )', ' ', str(row)).strip()
            row = row.split()
            row = [word for word in row if not word in set(stop_words)]
            row = ' '.join(row)
            row = row.lower()
            yield row

    
    def reviews(data):
        dataset = model.read_dataset(data)
        reviews = model.clean_dataset(dataset.review)
        nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
        reviews = [str(doc) for doc in nlp.pipe(reviews, batch_size = 128)]
        cv = CountVectorizer(max_features = 1500)
        X = cv.fit_transform(reviews).toarray()
        y = dataset.sentiment
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
        return cv, X_train, X_test, y_train, y_test


    def logit(X_train, y_train):
        parameters = [{'penalty':['l2','none'], 'solver':['newton-cg','lbfgs','saga'], 'C':[10,1.0,0.1,0.01]}]

        classifier = GridSearchCV(estimator = LogisticRegression(random_state = 0),
                           param_grid = parameters, scoring = 'accuracy',
                           cv = 10, n_jobs = -1)

        classifier.fit(X_train, y_train)
        best_accuracy = classifier.best_score_
        best_parameters = classifier.best_params_
        return classifier, best_accuracy, best_parameters


    def tree(X_train, y_train):
        parameters = [{'criterion':['gini','entropy'], 'splitter':['best','random'], 'max_features':['sqrt','log2']}]

        classifier = GridSearchCV(estimator = DecisionTreeClassifier(random_state = 0),
                           param_grid = parameters, scoring = 'accuracy',
                           cv = 10, n_jobs = -1)

        classifier.fit(X_train, y_train)
        best_accuracy = classifier.best_score_
        best_parameters = classifier.best_params_
        return classifier, best_accuracy, best_parameters


    def forest(X_train, y_train):
        parameters = [{'n_estimators':[20,40,60,80], 'criterion':['gini','entropy'], 'max_features':['auto','log2']}]

        classifier = GridSearchCV(estimator = RandomForestClassifier(random_state = 0),
                           param_grid = parameters, scoring = 'accuracy',
                           cv = 10, n_jobs = -1)

        classifier.fit(X_train, y_train)
        best_accuracy = classifier.best_score_
        best_parameters = classifier.best_params_
        return classifier, best_accuracy, best_parameters


    def SV(X_train, y_train):
        parameters = [{'kernel': ['linear','poly','sigmoid'], 'gamma': [0.001, 0.0001], 'C':[10,1.0,0.1,0.01]}]

        classifier = GridSearchCV(estimator = SVC(random_state = 0),
                           param_grid = parameters, scoring = 'accuracy',
                           cv = 10, n_jobs = -1)

        classifier.fit(X_train, y_train)
        best_accuracy = classifier.best_score_
        best_parameters = classifier.best_params_
        return classifier, best_accuracy, best_parameters


    def XGB(X_train, y_train):
        parameters = [{'max_depth':[4,6,8,10], 'n_estimators':[20,40,60,80]}]

        classifier = GridSearchCV(estimator = XGBClassifier(random_state = 0),
                           param_grid = parameters, scoring = 'accuracy',
                           cv = 10, n_jobs = -1)

        classifier.fit(X_train, y_train)
        best_accuracy = classifier.best_score_
        best_parameters = classifier.best_params_
        return classifier, best_accuracy, best_parameters

In [4]:
data = 'airline_sentiment_analysis.csv'
cv, X_train, X_test, y_train, y_test = model.reviews(data)
logit_classifier, logit_acc, logit_param = model.logit(X_train, y_train)
print(logit_acc, logit_param)
tree_classifier, tree_acc, tree_param = model.tree(X_train, y_train)
print(tree_acc, tree_param)
forest_classifier, forest_acc, forest_param = model.forest(X_train, y_train)
print(forest_acc, forest_param)
xgb_classifier, xgb_acc, xgb_param = model.XGB(X_train, y_train)
print(xgb_acc, xgb_param)
svc_classifier, svc_acc, svc_param = model.SV(X_train, y_train)
print(svc_acc, svc_param)

0.9146453311946269 {'C': 1.0, 'penalty': 'l2', 'solver': 'saga'}
0.8381721564820156 {'criterion': 'entropy', 'max_features': 'sqrt', 'splitter': 'best'}
0.9045706640777063 {'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 60}
0.904029304029304 {'max_depth': 10, 'n_estimators': 80}
0.9064136567657695 {'C': 1.0, 'gamma': 0.001, 'kernel': 'linear'}
