In [1]:
#Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

import re
import csv

#import skipthoughts
from autocorrect import spell

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer

# Seed passed as random_state to train_test_split so the train/test partition is repeatable.
random_seed = 42

In [2]:
def _load_slang_map(slang_path):
    """Parse the ``ABBREVIATION=phrase`` file at ``slang_path`` into a dict.

    The file is read with ``csv.reader`` using ``=`` as the delimiter, so the
    first field of each row is the abbreviation and the second its expansion.
    Rows with fewer than two fields (blank lines, lines without ``=``) are
    skipped. When an abbreviation occurs more than once, the last row wins.
    """
    slang_map = {}
    with open(slang_path, "r") as slang_file:
        for row in csv.reader(slang_file, delimiter="="):
            # csv.reader yields [] for blank lines and one field for lines
            # without "="; indexing those rows would raise IndexError.
            if len(row) < 2:
                continue
            slang_map[row[0]] = row[1]
    return slang_map


# Parsed slang tables keyed by file path, so the file is read once per path
# instead of once per word of every translated string.
_SLANG_CACHE = {}


def translator(user_string, slang_path="/home/connectwithprakash/Downloads/slang.txt", spell_fn=None):
    """Expand slang abbreviations in ``user_string`` and spell-correct the result.

    The text is split on single spaces. Each token is stripped of every
    character other than ASCII letters, digits, ``-``, ``_`` and ``.``,
    upper-cased, and looked up in the slang table. A hit replaces the whole
    token with the stored phrase; a miss leaves the original token untouched
    (including its punctuation). Every resulting token is then passed through
    the spell-correction callable and the tokens are re-joined with spaces.

    Parameters
    ----------
    user_string : str
        Text to normalise.
    slang_path : str, optional
        Path of the ``ABBREVIATION=phrase`` lookup file. Defaults to the
        location originally hard-coded in this notebook; pass a project
        relative path to run on another machine.
    spell_fn : callable, optional
        ``str -> str`` function applied to each token. Defaults to the
        module-level ``spell`` imported from ``autocorrect``.

    Returns
    -------
    str
        The translated, spell-corrected text.

    Raises
    ------
    OSError
        If the slang file cannot be opened.
    """
    if slang_path not in _SLANG_CACHE:
        _SLANG_CACHE[slang_path] = _load_slang_map(slang_path)
    slang_map = _SLANG_CACHE[slang_path]

    if spell_fn is None:
        # Resolved at call time so the default stays the notebook's `spell`.
        spell_fn = spell

    expanded = []
    for word in user_string.split(" "):
        # Removing special characters before the lookup; the table keys are
        # compared against the upper-cased token.
        key = re.sub('[^a-zA-Z0-9-_.]', '', word).upper()
        expanded.append(slang_map.get(key, word))

    return ' '.join(map(spell_fn, expanded))
import time
def clean_data(df, translate=None):
    """Strip mentions, ``www`` links and dot runs from a text Series, then translate it.

    Three regex substitutions are applied to every entry, in this order:

    1. ``@`` followed by non-space characters and one whitespace character is removed.
    2. A whitespace character followed by a non-space run containing ``www`` is removed.
    3. Each run of one or more dots is replaced by a single space.

    Every cleaned string is then passed through ``translate``. Progress is
    printed every 100 entries. The input Series is not modified; a new Series
    with the same index and name is returned.

    Parameters
    ----------
    df : pandas.Series
        Series of raw text (the parameter name is historical).
    translate : callable, optional
        ``str -> str`` function applied to each cleaned entry. Defaults to the
        notebook's ``translator``.

    Returns
    -------
    pandas.Series
        Cleaned and translated text, aligned with the index of ``df``.
    """
    if translate is None:
        # Resolved at call time so the default stays the notebook's translator.
        translate = translator

    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as literal text by default, which would silently skip all cleaning.
    cleaned = (
        df.str.replace(r'@\S+\s', '', regex=True)
          .str.replace(r'\s\S+www\S+', '', regex=True)
          .str.replace(r'\.+', " ", regex=True)
    )

    translated = []
    # Iterate by position rather than by label so any index works.
    for position, text in enumerate(cleaned):
        # Non-string entries (e.g. missing values) are passed through as-is.
        translated.append(translate(text) if isinstance(text, str) else text)
        if position % 100 == 0:
            print("Done : {}".format(position))

    return pd.Series(translated, index=cleaned.index, name=cleaned.name, dtype=cleaned.dtype)


In [3]:
# define skip-thoughts vectorizer class for scikit-learn
class SkipThoughtsVectorizer(object):
    """Wrapper exposing a skip-thoughts encoder through fit/transform methods.

    The model and encoder are created once in ``__init__``; ``transform`` and
    ``fit_transform`` both return ``encoder.encode(raw_documents, verbose=False)``.
    """

    def __init__(self, **kwargs):
        """Load the skip-thoughts model and build its encoder.

        ``**kwargs`` is accepted for call compatibility and ignored.
        """
        # Imported here because the module-level import of `skipthoughts` is
        # commented out in this notebook; without it this constructor raised
        # NameError. The package is only needed when the class is instantiated.
        import skipthoughts

        self.model = skipthoughts.load_model()
        self.encoder = skipthoughts.Encoder(self.model)

    def fit_transform(self, raw_documents, y=None):
        """Encode ``raw_documents``; ``y`` is accepted and ignored."""
        return self.encoder.encode(raw_documents, verbose=False)

    def fit(self, raw_documents, y=None):
        """Return ``self`` without touching the data.

        The previous implementation encoded every document and discarded the
        result. NOTE(review): this assumes ``encode`` has no side effects that
        later calls rely on -- confirm against the skipthoughts package.
        """
        return self

    def transform(self, raw_documents, copy=True):
        """Encode ``raw_documents``; ``copy`` is accepted and ignored."""
        return self.fit_transform(raw_documents)

In [None]:
#data extraction and cleaning
head = ['Emotions', 'Text']
df_train = pd.read_csv('train_data.csv')
df_train.columns = head
# Cells equal to 'empty' are turned into missing values so dropna() removes
# those rows. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_train = df_train.replace('empty', np.nan).dropna().reset_index(drop=True)

X = df_train['Text']
y = df_train['Emotions']

X = clean_data(X)

#Pickling data for future evaluation
with open("./EmotionAnalysis_data_X_train.pickle", "wb") as pickling_on:
    pickle.dump(X, pickling_on)

# File name fixed from "EmotionAnalysis_data_Y.pickle_train" so it matches the
# "./EmotionAnalysis_data_Y_train.pickle" path that the loading cell opens.
with open("./EmotionAnalysis_data_Y_train.pickle", "wb") as pickling_on:
    pickle.dump(y, pickling_on)

In [4]:
# Reload the cleaned text and the labels saved by the extraction cell.
# NOTE: pickle.load can execute arbitrary code; only open files produced by
# this notebook. The with-blocks close the handles even if loading fails.
with open("./EmotionAnalysis_data_X_train.pickle", "rb") as pickling_on:
    X_pickle = pickle.load(pickling_on)

with open("./EmotionAnalysis_data_Y_train.pickle", "rb") as pickling_on:
    y_pickle = pickle.load(pickling_on)

In [7]:
# Hold out 20% of the shuffled samples for testing; the split is seeded with random_seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_pickle,
    y_pickle,
    test_size=0.20,
    shuffle=True,
    random_state=random_seed,
)

In [15]:
# Baseline classifier: TF-IDF features over 1- to 3-grams fed to a linear SVM.
# The estimator is seeded with random_seed so repeated fits give the same
# model, in line with the seeded train/test split.
best_estimator = LinearSVC(random_state=random_seed)
tfidf = TfidfVectorizer(ngram_range=(1,3))

# The pipeline first vectorizes all the text and then classifies the vectors.
model_pipeline = Pipeline([('tfidf', tfidf),
                           ('estimator', best_estimator)])


#pipeline_skipthought = Pipeline(steps=[('vectorizer', SkipThoughtsVectorizer()),
#                        ('classifier', LogisticRegression())])

#feature_union = ('feature_union', FeatureUnion([
#    ('skipthought', SkipThoughtsVectorizer()),
#    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),]))
#pipeline_both = Pipeline(steps=[feature_union,
#                        ('classifier', LogisticRegression())])
model_pipeline.fit(X_train, y_train)

# Final fitted pipeline used for prediction and persisted to disk.
model = model_pipeline

with open("./EmotionAnalysis_model.pickle", "wb") as pickling_on:
    pickle.dump(model, pickling_on)

# Training score and test score.
# NOTE(review): the recorded run of the unseeded model shows ~0.99 training vs
# ~0.34 test accuracy, i.e. the model overfits heavily.
pred = model.predict(X_train)
print('Training set Accuracy : {}'.format(accuracy_score(y_train, pred)))
pred = model.predict(X_test)
print('Test set Accuracy     : {}'.format(accuracy_score(y_test, pred)))


Training set Accuracy : 0.9910105657805044
Test set Accuracy     : 0.34026239563809846
