In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline 


In [2]:
imdb_df = pd.read_csv('./datasets/shuffled-full-set-hashed.csv', names=["topic", "text"], header=None, encoding="ISO-8859-1")

In [3]:
def clean_data(inputString):
    try:
        selVal = inputString.split(' ', 1)[0] 
        anyVal = any(char.isdigit() for char in selVal) 
        return anyVal
    except ValueError:
        return False

In [4]:
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred,normalize=True)
    num_acc = accuracy_score(y_test, y_pred,normalize=False)
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    print("Length of testing data: ", len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)

In [5]:
def sentiment_analysis_pipeline(X, Y):
    pipeline = Pipeline([
     ('count_vectorizer', CountVectorizer()), 
     ('classifier', LogisticRegression(solver='lbfgs',  max_iter=10000))
    ])
    # Split train and test data sets, 8:2
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)
    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test)
    summarize_classification(y_test, y_pred) 

In [6]:
def multiclass_classification(X, Y):
    count_vectorizer = CountVectorizer(ngram_range=(2, 2))

    transformed_vector = count_vectorizer.fit_transform(X)
    transformed_vector.shape
    
    tfidf_transformer = TfidfTransformer()
    tfidf_vector = tfidf_transformer.fit_transform(transformed_vector)
    
    X_dense = tfidf_vector.todense()
    
    X_dense.shape
    
    x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2)
    x_train.shape, x_test.shape
    y_train.shape, y_test.shape
    clf = GaussianNB().fit(x_train, y_train)
    y_pred = clf.predict(x_test)

In [7]:
imdb_df.shape

(62425, 2)

In [8]:
### clean data, since we can find some missing data in one of the column

In [9]:
imdb_df = imdb_df[imdb_df['topic'].apply(lambda x: clean_data(x)== False)]
imdb_df = imdb_df[imdb_df['text'].apply(lambda x: x != '')]

In [10]:
imdb_df.shape

(62205, 2)

In [11]:
imdb_df = imdb_df.sample(10000, replace=False)

In [12]:
imdb_df.describe()

Unnamed: 0,topic,text
count,10000,9993
unique,14,9935
top,BILL,bf064c332aa1 079935e500e5 1a4dd36c6de0 7efa289...
freq,2994,3


In [13]:
X = imdb_df['text'].astype(str)

Y = imdb_df['topic'].astype(str)

In [14]:
X.head()

45048    60eb2e281af2 b9699ce57810 957b5cf4e65e d388206...
51093    6ca2dd348663 586242498a88 4ffb12504ac6 2dda221...
59331    1b6d0614f2c7 b73e657498f2 f62c5f87f0af 51a0b1f...
42873    6073b9c2d6ef 20d53168dbb6 fdf32f896cc3 113033b...
36511    abca9d18fae2 1efeef7889a4 a9ee836e8303 0562c75...
Name: text, dtype: object

In [15]:
Y.head()

45048                   BILL
51093                 BINDER
59331     NON-RENEWAL NOTICE
42873                   BILL
36511    CANCELLATION NOTICE
Name: topic, dtype: object

In [16]:
sentiment_analysis_pipeline(X, Y)



Length of testing data:  2000
accuracy_count :  1691
accuracy_score :  0.8455
precision_score :  0.8469143098287992
recall_score :  0.8455
