# Logisitc regression on Amazon reviews dataset

In [115]:
# importing library
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
# using the SQLite Table to read data.
con = sqlite3.connect('database.sqlite') 

#filtering only positive and negative reviews i.e. 
# not taking into consideration those reviews with Score=3
#creating new datasets after applying filter on reviews dataset

filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 """, con) 

# Give reviews with Score>3 a positive rating, and reviews with a score<3 a negative rating.
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

#changing reviews with score less than 3 to be positive and vice-versa
# with the help of this method returning positive and negative based on the score

actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative

In [3]:
print(filtered_data.shape) #looking at the size of the data
filtered_data.head() # top five reviews, just for understanding 

(525814, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
# delete the reviews which is same on the basis of few features 
final=filtered_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final.shape # after deleting, look at shape again

(364173, 10)

In [5]:
final.head() # look at top five reviews 

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [6]:
# As we know that helfulnessnumerator will not  be greater than helpfullness denominator 
# So we will remove that reviews because that reviews no make sense 

final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]

In [7]:
# after removing reviews in above cell which were useless, no make sense
# so look at the reviewsprint(final.shape)
print(final.shape)

#How many positive and negative reviews are present in our dataset?
final['Score'].value_counts()

(364171, 10)


positive    307061
negative     57110
Name: Score, dtype: int64

# Text preprocessing

In [8]:
# Removing Stop-words

stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence):  #function to clean the word of any html-tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence):  #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned

In [9]:
#Code for implementing step-by-step the checks mentioned in the pre-processing phase
# this code takes a while to run as it needs to run on 500k sentences.
i=0
str1=' '
final_string=[]
all_positive_words=[] # store words from +ve reviews here
all_negative_words=[] # store words from -ve reviews here.
s=''
for sent in final['Text'].values:
    filtered_sentence=[]
    
    sent=cleanhtml(sent) # remove HTMl tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)): # assure that cleaned words are alphabetical and length is greater than 2
                if(cleaned_words.lower() not in stop):  # thos words who were not in stop words
                    s=(sno.stem(cleaned_words.lower())).encode('utf8') # changing cleaned words into lower case
                    filtered_sentence.append(s)
                    if (final['Score'].values)[i] == 'positive':  #IF words are positive 
                        all_positive_words.append(s) #list of all words used to describe positive reviews
                    if(final['Score'].values)[i] == 'negative': # if words are negative
                        all_negative_words.append(s) #list of all words used to describe negative reviews reviews
                else:
                    continue
            else:
                continue 
    
    str1 = b" ".join(filtered_sentence) #final string of cleaned words
    
    
    final_string.append(str1) #final_string dataset appending string after cleaning words
    i+=1

In [10]:
final['CleanedText']=final_string #adding a column of CleanedText which displays the data after pre-processing of the review 
final['CleanedText']=final['CleanedText'].str.decode("utf-8")

In [11]:
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought sever vital can dog food product found ...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arriv label jumbo salt peanut peanut a...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,confect around centuri light pillowi citrus ge...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,look secret ingredi robitussin believ found go...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffi great price wide assort yummi taff...


In [12]:
final.shape # look at the shape of final dataset 

(364171, 11)

In [13]:
# sorting data on the basis of time stamp for time based splitting 
sorted_data=final.sort_values('Time', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
final=sorted_data

In [14]:
# importing library
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn import cross_validation




# Train test split

In [15]:
# split the data set into train and test
X_tr, X_tes, y_tr, y_test = cross_validation.train_test_split(final['CleanedText'].values,final['Score'], test_size=0.3, random_state=0)

# Bag of words

In [16]:
# we are collecting all the split words in the form of tokens matrix
count_vect = CountVectorizer(min_df=10) #in scikit-learn
X_train = count_vect.fit_transform(X_tr)#giving training data set to vectorize the training data
X_test = count_vect.transform(X_tes) #giving testing data set to vectorize the testing data
print("the type of count vectorizer ",type(X_train))
print("the shape of out text BOW vectorizer ",X_train.get_shape())
print("the number of unique words ", X_train.get_shape()[1])

the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (254919, 12703)
the number of unique words  12703


In [17]:
import warnings
warnings.filterwarnings('ignore')

# Data-preprocessing: Standardizing the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X_train_vec = sc.fit_transform(X_train)
X_test_vec = sc.transform(X_test)

# Grid search with l2 regularization

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

tuned_parameters = [{'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}]


#Using GridSearchCV
model = GridSearchCV(LogisticRegression(), tuned_parameters, scoring = 'accuracy', cv=5,n_jobs=-1)
model.fit(X_train_vec, y_tr)

print(model.best_estimator_)
print(max(model.cv_results_['mean_train_score']))
print(model.score(X_test_vec, y_test))

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
0.9451678769403882
0.9195255006773332


In [19]:
# training the model with the optimal lambda value 
lr1 = LogisticRegression(C=model.best_params_['C'])
lr1.fit(X_train_vec,y_tr)


LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

# to check multicollinearity 

In [20]:
shape1=X_train_vec.nnz # number of non zero element available in sparse matrix
shape1

7939848

In [21]:
a=X_train_vec.nonzero()  # function to get the row and column indices of non zero element in sparse matrix
a

(array([     0,      0,      0, ..., 254918, 254918, 254918], dtype=int32),
 array([ 1477,  3981,  4678, ..., 11321, 11397, 11645], dtype=int32))

In [22]:
# import sparse module from SciPy package 
from scipy import sparse

import sys
data=[] #initializing data to put it in sparse matrix 
row_ind=a[0]  # row indices of non zero element 
col_ind=a[1]# # column indices of non zero element 

# data to be stored in Csr sparse matrix
data[0:shape1]=[0.0001 for i in range(0,shape1)]

# create csr sparse matrix with the help of row,column indices and data 
X_train_vec_csr = sparse.csr_matrix((data, (row_ind, col_ind)))

# shape of new csr matrix
X_train_vec_csr.shape

(254919, 12703)

In [23]:
# new sparse matrix after adding epsilon value in our sparse feature matrix
X_train_vec_csr1=X_train_vec_csr+X_train_vec

In [24]:
# training the model with the optimal lambda value with new sparse matrix features
lr2 = LogisticRegression(C=model.best_params_['C'])
lr2.fit(X_train_vec_csr1,y_tr)


LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [25]:
arr1=np.absolute(lr2.coef_-lr1.coef_) #absolute diff of coefficient of both the matrix to check multicollinearity features

In [26]:
arr2=np.sort(arr1.ravel())[::-1] #sorting the array for better understanding of difference in coefficent 

In [27]:
arr2[0:10]  # here is our top difference of coefficient of features

array([1.91293309e-06, 1.78844256e-06, 1.71262469e-06, 1.51676013e-06,
       1.30210137e-06, 1.29873701e-06, 1.12637662e-06, 1.09039557e-06,
       1.08716796e-06, 1.07800618e-06])

# Observations

As we can see above there is little difference between our features sparse matrix coefficient and our second sparse matrix after adding epsilon in coeeficient so we can say there is no collinearity exist in our features so we can find important features as usual 

# Feature importance

In [28]:
arr3=lr1.coef_ #to find weight of most important features
arr2=np.absolute(lr1.coef_) # absolute value for sorting in decreasing order
arr2.shape # shape of weight

(1, 12703)

In [29]:
list1=np.argsort(arr2.ravel())[::-1] #sorting array to find index of most imp feature

In [30]:
list2=count_vect.get_feature_names()
for i in list1[0:5]: 
    print(list2[i])  # most important features
    print(arr3[0,i]) # weight of most important features

great
0.3592922625017345
love
0.2958555610940963
best
0.24476749531484415
good
0.2199514891031782
disappoint
-0.2078621695739689


# Random search with l2 regularization

In [31]:
from sklearn.model_selection import RandomizedSearchCV


param_dist = {'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}


#Using Randomsearch cv
random_search = RandomizedSearchCV(LogisticRegression(),param_dist,scoring='accuracy',cv=5,n_jobs=-1,n_iter=5)
                               
random_search.fit(X_train_vec, y_tr)

print(random_search.best_estimator_)
print(max(random_search.cv_results_['mean_train_score']))
print(random_search.score(X_test_vec, y_test))

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
0.9451678769403882
0.9195255006773332


# Grid search with l1 regularization

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

tuned_parameters = [{'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}]


#Using GridSearchCV
model = GridSearchCV(LogisticRegression(penalty='l1'), tuned_parameters, scoring = 'accuracy', cv=3,n_jobs=-1,pre_dispatch=2)
model.fit(X_train_vec, y_tr)

print(model.best_estimator_)
print(max(model.cv_results_['mean_train_score']))
print(model.score(X_test_vec, y_test))

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.948983793651004
0.9216307252956467


In [33]:
# training the model with the optimal lambda value 
lr1 = LogisticRegression(C=model.best_params_['C'])
lr1.fit(X_train_vec,y_tr)


LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# to check multicollinearity

In [34]:
shape1=X_train_vec.nnz # number of non zero element available in sparse matrix
shape1

7939848

In [35]:
a=X_train_vec.nonzero() # function to get the row and column indices of non zero element in sparse matrix
a

(array([     0,      0,      0, ..., 254918, 254918, 254918], dtype=int32),
 array([ 1477,  3981,  4678, ..., 11321, 11397, 11645], dtype=int32))

In [36]:
# import sparse module from SciPy package 
from scipy import sparse

import sys
data=[] #initializing data to put it in sparse matrix 
row_ind=a[0]  # row indices of non zero element 
col_ind=a[1]# # column indices of non zero element 

# data to be stored in Csr sparse matrix
data[0:shape1]=[0.0001 for i in range(0,shape1)]

# create csr sparse matrix with the help of row,column indices and data 
X_train_vec_csr = sparse.csr_matrix((data, (row_ind, col_ind)))

# shape of new csr matrix
X_train_vec_csr.shape

(254919, 12703)

In [37]:
# new sparse matrix after adding epsilon value in our sparse feature matrix
X_train_vec_csr1=X_train_vec_csr+X_train_vec

# training the model with the optimal lambda value with new sparse matrix features
lr2 = LogisticRegression(C=model.best_params_['C'])
lr2.fit(X_train_vec_csr1,y_tr)


LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
arr1=np.absolute(lr2.coef_-lr1.coef_) #absolute diff of coefficient of both the matrix to check multicollinearity features
arr2=np.sort(arr1.ravel())[::-1] #sorting the array for better understanding of difference in coefficent 

In [39]:
arr2[0:10] # here is our top difference of coefficient of features

array([0.00114352, 0.00080803, 0.00061883, 0.00052609, 0.00050936,
       0.00040204, 0.00036598, 0.00031517, 0.00031244, 0.0003102 ])

# Observations

As we can see above there is little difference between our features sparse matrix coefficient and our second sparse matrix after adding epsilon in coeeficient so we can say there is no collinearity exist in our features so we can find important features as usual 

# Feature importance

In [40]:
arr3=lr1.coef_ #to find weight of most important features
arr2=np.absolute(lr1.coef_) # absolute value for sorting in decreasing order
arr2.shape # shape of weight

(1, 12703)

In [41]:
list1=np.argsort(arr2.ravel())[::-1] #sorting array to find index of most imp feature

In [42]:
list2=count_vect.get_feature_names()
for i in list1[0:5]: 
    print(list2[i])  # most important features
    print(arr3[0,i]) # weight of most important features

great
0.8023428526870029
best
0.5904135352417754
love
0.5830848186945969
delici
0.5299930076366527
perfect
0.4759886008911331


# Random search with l1 regularization

In [43]:
from sklearn.model_selection import RandomizedSearchCV


param_dist = {'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}


#Using Randomsearch cv
random_search = RandomizedSearchCV(LogisticRegression(penalty='l1'),param_dist,scoring='accuracy',cv=3,n_jobs=-1,n_iter=5,pre_dispatch=2)
                                   
random_search.fit(X_train_vec, y_tr)

print(random_search.best_estimator_)
print(max(random_search.cv_results_['mean_train_score']))
print(random_search.score(X_test_vec, y_test))

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.9489818323244447
0.9216307252956467


# l1 regularization with different value of C 

In [44]:
clf = LogisticRegression(C=0.1, penalty='l1');
clf.fit(X_train_vec, y_tr);
w = clf.coef_
print(np.count_nonzero(w))


11643


In [45]:
clf = LogisticRegression(C=0.01, penalty='l1');
clf.fit(X_train_vec, y_tr);
w = clf.coef_
print(np.count_nonzero(w))
    

5937


In [46]:
clf = LogisticRegression(C=0.001, penalty='l1');
clf.fit(X_train_vec, y_tr);
w = clf.coef_
print(np.count_nonzero(w))


388


In [47]:
clf = LogisticRegression(C=0.0005, penalty='l1');
clf.fit(X_train_vec, y_tr);
w = clf.coef_
print(np.count_nonzero(w))


173


In [48]:
clf = LogisticRegression(C=0.10, penalty='l1');
clf.fit(X_train_vec, y_tr);
w = clf.coef_
print(np.count_nonzero(w))


11641


# Observations

As we can see above, if C decreases it's mean lambda increases and we can observe that if lambda increases then number of non zero elements decreases it's mean sparsity increases

# Logistic regression on tf-idf

In [49]:
# split the data set into train and test
X_tr, X_tes, y_train, y_test = cross_validation.train_test_split(final['CleanedText'].values, final['Score'], test_size=0.3, random_state=0)

In [50]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=5)
X_train_tfidf = tf_idf_vect.fit_transform(X_tr)
X_test_tfidf=tf_idf_vect.transform(X_tes)

In [51]:
# Data-preprocessing: Standardizing the data
sc = StandardScaler(with_mean=False)
X_train_tfidf_vec = sc.fit_transform(X_train_tfidf)
X_test_tfidf_vec = sc.transform(X_test_tfidf)

# Grid search with l2 regularization

In [52]:
tuned_parameters = [{'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}]


#Using GridSearchCV
model = GridSearchCV(LogisticRegression(), tuned_parameters, scoring = 'accuracy', cv=3,n_jobs=-1,pre_dispatch=2)
model.fit(X_train_tfidf_vec, y_train)

print(model.best_estimator_)
print(max(model.cv_results_['mean_train_score']))
print(model.score(X_test_tfidf_vec, y_test))

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
0.9999882315559058
0.9214751217369018


In [53]:
# training the model with the optimal lambda value 
lr1 = LogisticRegression(C=model.best_params_['C'])
lr1.fit(X_train_tfidf_vec,y_train)


LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

#  to check multicollinearity

In [54]:
shape1=X_train_tfidf_vec.nnz # number of non zero element available in sparse matrix
shape1

14483646

In [55]:
a=X_train_tfidf_vec.nonzero() # function to get the row and column indices of non zero element in sparse matrix
a

(array([     0,      0,      0, ..., 254918, 254918, 254918], dtype=int32),
 array([ 36487, 266050, 294819, ...,   5172,  82545, 181159], dtype=int32))

In [56]:
# import sparse module from SciPy package 
from scipy import sparse

import sys
data=[] #initializing data to put it in sparse matrix 
row_ind=a[0]  # row indices of non zero element 
col_ind=a[1]# # column indices of non zero element 

# data to be stored in Csr sparse matrix
data[0:shape1]=[0.0001 for i in range(0,shape1)]

# create csr sparse matrix with the help of row,column indices and data 
X_train_tfidf_vec_csr = sparse.csr_matrix((data, (row_ind, col_ind)))

# shape of new csr matrix
X_train_tfidf_vec_csr.shape

(254919, 305513)

In [57]:
# new sparse matrix after adding epsilon value in our sparse feature matrix
X_train_tfidf_vec_csr1=X_train_tfidf_vec_csr+X_train_tfidf_vec

# training the model with the optimal lambda value with new sparse matrix features
lr2 = LogisticRegression(C=model.best_params_['C'])
lr2.fit(X_train_tfidf_vec_csr1,y_train)


LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [58]:
arr1=np.absolute(lr2.coef_-lr1.coef_) #absolute diff of coefficient of both the matrix to check multicollinearity features
arr2=np.sort(arr1.ravel())[::-1] #sorting the array for better understanding of difference in coefficent 

In [59]:
arr2[0:10] # here is our top difference of coefficient of features

array([3.02476438e-05, 2.34309212e-05, 1.97582890e-05, 1.91828160e-05,
       1.75390936e-05, 1.73465807e-05, 1.63124394e-05, 1.51056552e-05,
       1.35875420e-05, 1.35798101e-05])

# Observations

As we can see above there is little difference between our features sparse matrix coefficient and our second sparse matrix after adding epsilon in coeeficient so we can say there is no collinearity exist in our features so we can find important features as usual 

# Feature importance

In [60]:
arr3=lr1.coef_ #to find weight of most important features
arr2=np.absolute(lr1.coef_) # absolute value for sorting in decreasing order
arr2.shape # shape of weight

(1, 305513)

In [61]:
list1=np.argsort(arr2.ravel())[::-1] #sorting array to find index of most imp feature

In [62]:
list2=tf_idf_vect.get_feature_names()
for i in list1[0:5]: 
    print(list2[i])  # most important features
    print(arr3[0,i]) # weight of most important features

great
0.1315125557118817
love
0.12583890714644988
best
0.09391876229981895
good
0.09364575512247429
disappoint
-0.08067444353364092


# Random search with l2 regularization

In [63]:
from sklearn.model_selection import RandomizedSearchCV


param_dist = {'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}


#Using Randomsearch cv
random_search = RandomizedSearchCV(LogisticRegression(),param_dist,scoring='accuracy',cv=3,n_jobs=-1,n_iter=5,pre_dispatch=2)
                                   
random_search.fit(X_train_tfidf_vec, y_train)

print(random_search.best_estimator_)
print(max(random_search.cv_results_['mean_train_score']))
print(random_search.score(X_test_tfidf_vec, y_test))

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
0.9999882315559058
0.9214751217369018


# Grid search with l1 regularization

In [64]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

tuned_parameters = [{'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}]


#Using GridSearchCV
model = GridSearchCV(LogisticRegression(penalty='l1'), tuned_parameters, scoring = 'accuracy', cv=3,n_jobs=-1,pre_dispatch=2)
model.fit(X_train_tfidf_vec, y_train)

print(model.best_estimator_)
print(max(model.cv_results_['mean_train_score']))
print(model.score(X_test_tfidf_vec, y_test))

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.9999882315559058
0.9296305788452386


In [65]:
# training the model with the optimal lambda value 
lr1 = LogisticRegression(penalty='l1',C=model.best_params_['C'])
lr1.fit(X_train_tfidf_vec,y_train)


LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# to check multicollinearity

In [66]:
shape1=X_train_tfidf_vec.nnz # number of non zero element available in sparse matrix
shape1

14483646

In [67]:
a=X_train_tfidf_vec.nonzero() # function to get the row and column indices of non zero element in sparse matrix
a

(array([     0,      0,      0, ..., 254918, 254918, 254918], dtype=int32),
 array([ 36487, 266050, 294819, ...,   5172,  82545, 181159], dtype=int32))

In [68]:
# import sparse module from SciPy package 
from scipy import sparse

import sys
data=[] #initializing data to put it in sparse matrix 
row_ind=a[0]  # row indices of non zero element 
col_ind=a[1]# # column indices of non zero element 

# data to be stored in Csr sparse matrix
data[0:shape1]=[0.0001 for i in range(0,shape1)]

# create csr sparse matrix with the help of row,column indices and data 
X_train_tfidf_vec_csr = sparse.csr_matrix((data, (row_ind, col_ind)))

# shape of new csr matrix
X_train_tfidf_vec_csr.shape

(254919, 305513)

In [69]:
# new sparse matrix after adding epsilon value in our sparse feature matrix
X_train_tfidf_vec_csr1=X_train_tfidf_vec_csr+X_train_tfidf_vec

# training the model with the optimal lambda value with new sparse matrix features
lr2 = LogisticRegression(penalty='l1',C=model.best_params_['C'])
lr2.fit(X_train_tfidf_vec_csr1,y_train)


LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [70]:
arr1=np.absolute(lr2.coef_-lr1.coef_) #absolute diff of coefficient of both the matrix to check multicollinearity features
arr2=np.sort(arr1.ravel())[::-1] #sorting the array for better understanding of difference in coefficent 

In [71]:
arr2[0:10] # here is our top difference of coefficient of features

array([0.00494096, 0.00373867, 0.00373752, 0.00346236, 0.00270984,
       0.0027069 , 0.0025327 , 0.00192536, 0.00191478, 0.00191142])

# Observations

As we can see above there is little difference between our features sparse matrix coefficient and our second sparse matrix after adding epsilon in coeeficient so we can say there is no collinearity exist in our features so we can find important features as usual 

# Feature importance

In [72]:
arr3=lr1.coef_ #to find weight of most important features
arr2=np.absolute(lr1.coef_) # absolute value for sorting in decreasing order
arr2.shape # shape of weight

(1, 305513)

In [73]:
list1=np.argsort(arr2.ravel())[::-1] #sorting array to find index of most imp feature

In [74]:
list2=tf_idf_vect.get_feature_names()
for i in list1[0:5]: 
    print(list2[i])  # most important features
    print(arr3[0,i]) # weight of most important features

great
0.8716541974128199
love
0.6327252052368632
best
0.6100860385058007
delici
0.5168643601019093
perfect
0.4581109688374052


# Random search with l1 regularization

In [75]:
from sklearn.model_selection import RandomizedSearchCV


param_dist = {'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}


#Using Randomsearch cv
random_search = RandomizedSearchCV(LogisticRegression(penalty='l1'),param_dist,scoring='accuracy',cv=3,n_jobs=-1,n_iter=5,pre_dispatch=2)
                                   
random_search.fit(X_train_tfidf_vec, y_train)

print(random_search.best_estimator_)
print(max(random_search.cv_results_['mean_train_score']))
print(random_search.score(X_test_tfidf_vec, y_test))

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.9999882315559058
0.9296580382967817


# l1 regularization with different value of C 

In [76]:
clf = LogisticRegression(C=0.1, penalty='l1');
clf.fit(X_train_tfidf_vec, y_train);
w = clf.coef_
print(np.count_nonzero(w))


48892


In [77]:
clf = LogisticRegression(C=0.01, penalty='l1');
clf.fit(X_train_tfidf_vec, y_train);
w = clf.coef_
print(np.count_nonzero(w))
    

31972


In [78]:
clf = LogisticRegression(C=0.001, penalty='l1');
clf.fit(X_train_tfidf_vec, y_train);
w = clf.coef_
print(np.count_nonzero(w))


613


In [79]:
clf = LogisticRegression(C=0.0005, penalty='l1');
clf.fit(X_train_tfidf_vec, y_train);
w = clf.coef_
print(np.count_nonzero(w))


236


In [80]:
clf = LogisticRegression(C=10, penalty='l1');
clf.fit(X_train_tfidf_vec, y_train);
w = clf.coef_
print(np.count_nonzero(w))


98690


# Observations

As we can see above, if C decreases it's mean lambda increases and we can observe that if lambda increases then number of non zero elements decreases it's mean sparsity increases

# Logistic regression on avgw2v

In [81]:
# split the data set into train and test
X_tr, X_tes, y_train, y_test = cross_validation.train_test_split(final['CleanedText'].values, final['Score'], test_size=0.3, random_state=0)

In [82]:
#for train data we are finding avg w2v for each train data
# Train your own Word2Vec model using your own text corpus
i=0
list_of_sent=[]
for sent in X_tr:
    list_of_sent.append(sent.split())

w2v_model=Word2Vec(list_of_sent,min_count=5,size=50, workers=4)
w2v_words = list(w2v_model.wv.vocab)


In [83]:
X_train_avgw2v = []; # the X_train_avgw2v for each sentence/review is stored in this list
for sent in list_of_sent: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    X_train_avgw2v.append(sent_vec)

In [84]:
#for train data we are finding avg w2v for each train data
# Train your own Word2Vec model using your own text corpus
i=0
list_of_sent=[]
for sent in X_tes:
    list_of_sent.append(sent.split())

w2v_model=Word2Vec(list_of_sent,min_count=5,size=50, workers=4)
w2v_words = list(w2v_model.wv.vocab)

In [85]:
X_test = []; # the X_train_avgw2v for each sentence/review is stored in this list
for sent in list_of_sent: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    X_test.append(sent_vec)

In [86]:
# Data-preprocessing: Standardizing the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X_train_avgw2v_vec = sc.fit_transform(X_train_avgw2v)
X_test_avgw2v_vec = sc.transform(X_test)

# Grid search with l2 regularization

In [87]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

tuned_parameters = [{'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}]


#Using GridSearchCV
model = GridSearchCV(LogisticRegression(), tuned_parameters, scoring = 'accuracy', cv=3,n_jobs=-1,pre_dispatch=2)
model.fit(X_train_avgw2v_vec, y_train)

print(model.best_estimator_)
print(max(model.cv_results_['mean_train_score']))
print(model.score(X_test_avgw2v_vec, y_test))

LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.8973948603840225
0.8456046571229817


# Random search with l2 regularization

In [88]:
from sklearn.model_selection import RandomizedSearchCV


param_dist = {'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}


#Using Randomsearch cv
random_search = RandomizedSearchCV(LogisticRegression(),param_dist,scoring='accuracy',cv=3,n_jobs=-1,n_iter=5,pre_dispatch=2)
                               
random_search.fit(X_train_avgw2v_vec, y_train)

print(random_search.best_estimator_)
print(max(random_search.cv_results_['mean_train_score']))
print(random_search.score(X_test_avgw2v_vec, y_test))

LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.8973948603840225
0.8456046571229817


# Grid search with l1 regularization

In [89]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

tuned_parameters = [{'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}]


#Using GridSearchCV
model = GridSearchCV(LogisticRegression(penalty='l1'), tuned_parameters, scoring = 'accuracy', cv=3,n_jobs=-1,pre_dispatch=2)
model.fit(X_train_avgw2v_vec, y_train)

print(model.best_estimator_)
print(max(model.cv_results_['mean_train_score']))
print(model.score(X_test_avgw2v_vec, y_test))

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.8973497481304076
0.8459433236920147


# Random search with l1 regularization

In [90]:
from sklearn.model_selection import RandomizedSearchCV


param_dist = {'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}


#Using Randomsearch cv
random_search = RandomizedSearchCV(LogisticRegression(penalty='l1'),param_dist,scoring='accuracy',cv=3,n_jobs=-1,n_iter=5,pre_dispatch=2)
                                
random_search.fit(X_train_avgw2v_vec, y_train)

print(random_search.best_estimator_)
print(max(random_search.cv_results_['mean_train_score']))
print(random_search.score(X_test_avgw2v_vec, y_test))

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.8973595551325287
0.8459433236920147


# l1 regularization with different value of C

In [91]:
clf = LogisticRegression(C=0.1, penalty='l1');
clf.fit(X_train_avgw2v_vec, y_train);
w = clf.coef_
print(np.count_nonzero(w))


50


In [92]:
clf = LogisticRegression(C=0.01, penalty='l1');
clf.fit(X_train_avgw2v_vec, y_train);
w = clf.coef_
print(np.count_nonzero(w))
    

48


In [93]:
clf = LogisticRegression(C=0.001, penalty='l1');
clf.fit(X_train_avgw2v_vec, y_train);
w = clf.coef_
print(np.count_nonzero(w))


42


In [94]:
clf = LogisticRegression(C=0.0005, penalty='l1');
clf.fit(X_train_avgw2v_vec, y_train);
w = clf.coef_
print(np.count_nonzero(w))


32


In [95]:
clf = LogisticRegression(C=10, penalty='l1');
clf.fit(X_train_avgw2v_vec, y_train);
w = clf.coef_
print(np.count_nonzero(w))


50


# Observations

As we can see above, if C decreases it's mean lambda increases and we can observe that if lambda increases then number of non zero elements decreases it's mean sparsity increases

# Logistic regression on tf-idfw2v

In [96]:
my_final=final.sample(100000)

In [97]:
# split the data set into train and test
X_1, X_tes, y_1, y_test = cross_validation.train_test_split(my_final['CleanedText'].values, my_final['Score'], test_size=0.3, random_state=0)

In [98]:
#for train data 
tf_idf_vect = TfidfVectorizer()
final_tf_idf = tf_idf_vect.fit_transform(X_1)

In [99]:
# Train your own tfidf Word2Vec model using your own text corpus
i=0
list_of_sent=[]
for sent in X_1:
    list_of_sent.append(sent.split())
    
w2v_model=Word2Vec(list_of_sent,min_count=5,size=50, workers=4)
w2v_words = list(w2v_model.wv.vocab)

In [100]:
# TF-IDF weighted Word2Vec
tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

X_train_tfidfw2v = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in list_of_sent: # for each review/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            # obtain the tf_idfidf of a word in a sentence/review
            tf_idf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    X_train_tfidfw2v.append(sent_vec)
    row += 1

In [101]:
#for train data 
tf_idf_vect = TfidfVectorizer()
final_tf_idf = tf_idf_vect.fit_transform(X_tes)

In [102]:
# Train your own tfidf Word2Vec model using your own text corpus
i=0
list_of_sent=[]
for sent in X_tes:
    list_of_sent.append(sent.split())
w2v_model=Word2Vec(list_of_sent,min_count=5,size=50, workers=4)
w2v_words = list(w2v_model.wv.vocab)

In [103]:
# TF-IDF weighted Word2Vec
tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

X_test = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in list_of_sent: # for each review/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            # obtain the tf_idfidf of a word in a sentence/review
            tf_idf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    X_test.append(sent_vec)
    row += 1

In [104]:
# Data-preprocessing: Standardizing the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X_train_tfidfw2v_vec = sc.fit_transform(X_train_tfidfw2v)
X_test_tfidfw2v_vec = sc.transform(X_test)

# Grid search with l2 regularization

In [105]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

tuned_parameters = [{'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}]


#Using GridSearchCV
model = GridSearchCV(LogisticRegression(), tuned_parameters, scoring = 'accuracy', cv=3,n_jobs=-1,pre_dispatch=2)
model.fit(X_train_tfidfw2v_vec, y_1)

print(model.best_estimator_)
print(max(model.cv_results_['mean_train_score']))
print(model.score(X_test_tfidfw2v_vec, y_test))

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.8777642810079015
0.843


# Random search with l2 regularization

In [106]:
from sklearn.model_selection import RandomizedSearchCV


param_dist = {'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}


#Using Randomsearch cv
random_search = RandomizedSearchCV(LogisticRegression(),param_dist,scoring='accuracy',cv=3,n_jobs=-1,n_iter=5,pre_dispatch=2)
                                 
random_search.fit(X_train_tfidfw2v_vec, y_1)

print(random_search.best_estimator_)
print(max(random_search.cv_results_['mean_train_score']))
print(random_search.score(X_test_tfidfw2v_vec, y_test))

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.8777642810079015
0.843


# Grid search with l1 regularization

In [107]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

tuned_parameters = [{'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}]


#Using GridSearchCV
model = GridSearchCV(LogisticRegression(penalty='l1'), tuned_parameters, scoring = 'accuracy', cv=3,n_jobs=-1,pre_dispatch=2)
model.fit(X_train_tfidfw2v_vec, y_1)

print(model.best_estimator_)
print(max(model.cv_results_['mean_train_score']))
print(model.score(X_test_tfidfw2v_vec, y_test))

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.8777357105487219
0.843


# Random search with l1 regularization

In [108]:
from sklearn.model_selection import RandomizedSearchCV


param_dist = {'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}


#Using Randomsearch cv
random_search = RandomizedSearchCV(LogisticRegression(penalty='l1'),param_dist,scoring='accuracy',cv=3,n_jobs=-1,n_iter=5,pre_dispatch=2)
                                   
random_search.fit(X_train_tfidfw2v_vec, y_1)

print(random_search.best_estimator_)
print(max(random_search.cv_results_['mean_train_score']))
print(random_search.score(X_test_tfidfw2v_vec, y_test))

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.8777428532017822
0.843


# l1 regularization with different value of C

In [109]:
clf = LogisticRegression(C=0.1, penalty='l1');
clf.fit(X_train_tfidfw2v_vec, y_1);
w = clf.coef_
print(np.count_nonzero(w))


49


In [110]:
clf = LogisticRegression(C=0.01, penalty='l1');
clf.fit(X_train_tfidfw2v_vec, y_1);
w = clf.coef_
print(np.count_nonzero(w))
    

44


In [111]:
clf = LogisticRegression(C=0.001, penalty='l1');
clf.fit(X_train_tfidfw2v_vec, y_1);
w = clf.coef_
print(np.count_nonzero(w))


24


In [112]:
clf = LogisticRegression(C=0.0005, penalty='l1');
clf.fit(X_train_tfidfw2v_vec, y_1);
w = clf.coef_
print(np.count_nonzero(w))


18


In [113]:
clf = LogisticRegression(C=10, penalty='l1');
clf.fit(X_train_tfidfw2v_vec, y_1);
w = clf.coef_
print(np.count_nonzero(w))

50


# Observations

As we can see above, if C decreases it's mean lambda increases and we can observe that if lambda increases then number of non zero elements decreases it's mean sparsity increases

# Table with model,lambda,train error and test error

In [114]:
from prettytable import PrettyTable
    
x = PrettyTable()

x.field_names = ["Model", "Optimal lambda", "Train error", "Test error"]

x.add_row(["GRIDSEARCHCV_L2(BOW)", 0.0001,0.06,9])
x.add_row(["RANDOMSEARCHCV_L2(BOW)", 0.001,0.06,9])
x.add_row(["GRIDSEARCHCV_L1(BOW)", 0.01,0.06,8])
x.add_row(["RANDOMSEARCHCV_L1(BOW)", 0.01,0.06,8])
x.add_row(["GRIDSEARCHCV_L2(tfidf)", 0.0001,0.01,8])
x.add_row(["RANDOMSEARCHCV_L2(tfidf)", 0.0001,0.01,8])
x.add_row(["GRIDSEARCHCV_L1(tfidf)", 0.01,0.01,8])
x.add_row(["RANDOMSEARCHCV_L1(tfidf)",  0.01,0.01,8])
x.add_row(["GRIDSEARCHCV_L2(AVGW2V)", 10000,0.11,15])
x.add_row(["RANDOMSEARCHCV_L2(AVGW2V)",10000,0.11,15])
x.add_row(["GRIDSEARCHCV_L1(AVGW2V)", 10000,0.11,15])
x.add_row(["RANDOMSEARCHCV_L1(AVGW2V)", 100,0.11,15])
x.add_row(["GRIDSEARCHCV_L2(TFIDFW2V)", 1,0.13,17])
x.add_row(["RANDOMSEARCHCV_L2(TFIDFW2V)", 1,0.13,17])
x.add_row(["GRIDSEARCHCV_L1(TFIDFW2V)", 100,0.13,17])
x.add_row(["RANDOMSEARCHCV_L1(TFIDFW2V)", 100,0.13,17])

print(x)

+-----------------------------+----------------+-------------+------------+
|            Model            | Optimal lambda | Train error | Test error |
+-----------------------------+----------------+-------------+------------+
|     GRIDSEARCHCV_L2(BOW)    |     0.0001     |     0.06    |     9      |
|    RANDOMSEARCHCV_L2(BOW)   |     0.001      |     0.06    |     9      |
|     GRIDSEARCHCV_L1(BOW)    |      0.01      |     0.06    |     8      |
|    RANDOMSEARCHCV_L1(BOW)   |      0.01      |     0.06    |     8      |
|    GRIDSEARCHCV_L2(tfidf)   |     0.0001     |     0.01    |     8      |
|   RANDOMSEARCHCV_L2(tfidf)  |     0.0001     |     0.01    |     8      |
|    GRIDSEARCHCV_L1(tfidf)   |      0.01      |     0.01    |     8      |
|   RANDOMSEARCHCV_L1(tfidf)  |      0.01      |     0.01    |     8      |
|   GRIDSEARCHCV_L2(AVGW2V)   |     10000      |     0.11    |     15     |
|  RANDOMSEARCHCV_L2(AVGW2V)  |     10000      |     0.11    |     15     |
|   GRIDSEAR

# Observations

1. Let's talk about the objective of this assignment first, In this assignment, we have to find optimal lambda with the help of gridsearchcv and random search cv with estimator logistic regression and we have the dataset which is amazon fine food reviews dataset which is high dimensional dataset. and also with l1 regularization and l2 regularization to find optimal lambda.
2. Then we have to check the sparsity and the variation of sparsity mean number of non zero element after changing the value of optimal lambda and in this assignment we will also find the feature importance after checkng multicollinearity is there multicollinearity exist or not.Multicollinearity is a statistical phenomenon in which two or more predictor variables in a multiple logistic regression model are highly correlated or associated.
3. Now talk about the solution of this assignment first of all we load the data as usual and preprocess the data then we will split the data in to train and test dataset then we will find bag of words with the help of vectorizer then i did the gridsearcv and randomsearchcv to find optimal lambda with l1 regularization and l2 regularization and i checked it for multicollinearity,after checking multicollinearity i find the most important features.
4. Procedure to check the multicollinearity features, first of all i add the epsilon in our sparse matrix then i find the coefficient of both sparse matrix then we find the absolute difference of both the coefficient of sparse matrix if there will be the little bit difference then we can say there is no multicollinearity features exist in our matrix. 
5. Procedure to find the feature importance of our features, then i get the feature names of our sparse matrix and then i find the weight or coeeficient of our features then we sort the coefficient based on index of top most weight present in our data then we can get the top most or important features with the help of index.
6. This all the procedure i did for bag of words,tfidf,avgw2v and tfidfw2v to find optimal lambda with the help of gridsearchcv and randomsearchcv with l1 and l2 regularization. 
7. we got the important or top most features for bag of words and tfidf. and we also checked the variation of sparsity after changing the value of optimal lambda. then we will see that if C decreases it's mean lambda increases and we can observe that if lambda increases then number of non zero elements decreases it's mean sparsity increases. we did it for all the technique to know how our sparse matrix change after changing the value of optimal lambda.
8. And at last, I have draw a table with the help of library which is prettytable that contains all the information including with model, hyperparameter mean optimal lambda,train error and test error for each model that i have trained in our assignment. 