In [5]:
import pandas as pd
import numpy as np
import bz2

In [6]:
def get_labels_texts(file):
    labels = []
    texts  = []
    for line in bz2.BZ2File(file):
        x = line.decode('utf-8')
        labels.append(int(x[9])-1)
        texts.append(x[10:].strip())
        
    return np.array(labels), texts

train_labels, train_texts = get_labels_texts("../Data/amazon/train.ft.txt.bz2")
test_labels, test_texts = get_labels_texts("../Data/amazon/test.ft.txt.bz2")

FileNotFoundError: [Errno 2] No such file or directory: '../Data/amazon/train.ft.txt.bz2'

In [3]:
train_labels[0]

1

In [4]:
train_texts[0]

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [5]:
small_train_labels = train_labels[0:500]
small_train_texts  = train_texts[0:500]

## Text PreProcessing

In [6]:
import re

NON_ALPHANUMERIC = re.compile(r'[\W]')
NON_ASCII        = re.compile(r'[^a-z0-1\S]')

def normalize_text(texts):
    normalized_list = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUMERIC.sub(r' ',lower)
        no_non_ascii   = NON_ASCII.sub(r' ',no_punctuation)
        
        # split and join to remove extra spaces between words
        clean_text = no_non_ascii.split(" ")
        clean_text = " ".join([word for word in clean_text if word])
        
        normalized_list.append(clean_text)
        
    return normalized_list

In [7]:
small_train_texts = normalize_text(small_train_texts)

In [8]:
small_train_texts[0]

'stuning even for the non gamer this sound track was beautiful it paints the senery in your mind so well i would recomend it even to people who hate vid game music i have played the game chrono cross but out of all of the games i have ever played it has the best music it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras it would impress anyone who cares to listen _'

## Count Vectorizer

In [9]:
from sklearn.feature_extraction.text import CountVectorizer 

cv = CountVectorizer(binary=True)

In [10]:
cv.fit(small_train_texts)

CountVectorizer(binary=True)

In [11]:
X = cv.transform(small_train_texts)

In [12]:
X_test  =  cv.transform(test_texts)

In [13]:
X_test

<400000x5853 sparse matrix of type '<class 'numpy.int64'>'
	with 18679328 stored elements in Compressed Sparse Row format>

In [18]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split 

X_train , X_val, y_train, y_val = train_test_split(X, small_train_labels, train_size = 0.75)

for c in [0.01, 0.025, 0.05, 0.75, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print(f'Accuracy for C:{c}, {accuracy_score(y_val, lr.predict(X_val))}')

Accuracy for C:0.01, 0.76
Accuracy for C:0.025, 0.752
Accuracy for C:0.05, 0.752
Accuracy for C:0.75, 0.768
Accuracy for C:1, 0.768


In [23]:
lr.predict(X_test[9])

array([1])

In [24]:
test_labels[9]

0

In [4]:
# TFIdF with ngrame 

from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
len(vect.get_feature_names())

NameError: name 'X_train' is not defined

In [None]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

pred = model.predict(vect.transform(X_test))

roc_auc_score(y_test, pred)

In [None]:
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

In [None]:
# Test
print(model.predict(vect.transform(['not an issue, phone is working', 
                                    'an issue, phone is not working'])))

In [None]:
# Crerating pipelines with Grid Search CV

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy'],
            
        }
    }     
}

In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

# TEST CASE

In [34]:
test  =  cv.transform(['this is a not a good product!, but it crashed the next day making my heart go'])

In [41]:
lr.predict(X_test[:50])

array([1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0])

In [44]:
test_labels[0:50]

array([1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 1])

In [47]:
accuracy_score(test_labels[150:250], lr.predict(X_test[150:250]))

0.73