## Logistic Regerssion : Plain data


In [268]:
import pandas as pd
import os
from nltk.corpus import stopwords
from IPython.display import display

from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model.logistic import LogisticRegression

### Read and Prepare Data

In [269]:
paragraphs = []
label = []
trainDataText = open(os.path.join(os.getcwd(),"TrainData\\TrainData.txt"),'r')

In [270]:
df = pd.DataFrame(columns=['paragraph', 'label'])
for line in trainDataText.readlines():
    line = line.rstrip('\n').split('\t')
    paragraphs.append(line[0])
    l = 1 if line[1] == 'company' else 0
    label.append(1 if line[1] == 'company' else 0)
    df = df.append({'paragraph':line[0],'label' :l},ignore_index=True)

### Run below

In [332]:
ngram = (1,1)
maxf= 800

### Vectorize Data

In [333]:
vectorizer = TfidfVectorizer("english",ngram_range = ngram, max_features = maxf)
X = vectorizer.fit_transform(df['paragraph'])

### Split Data

In [334]:
X_train, X_test, Y_train, Y_test = train_test_split(X, label, stratify=label, test_size=.4)

In [335]:
#{'C': 10, 'solver': 'lbfgs', 'max_iter': 5,} --> found using grid search below
#classifier = LogisticRegression(max_iter=5, solver='lbfgs',C=10)
classifier = LogisticRegression() #  default parameters produce inferior results
classifier.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [336]:
predictions = classifier.predict(X_test)

In [337]:
print ("Accuracy Score : ",accuracy_score(Y_test, predictions))
print ("F1 Score : ",f1_score(Y_test, predictions))

Accuracy Score :  0.8352941176470589
F1 Score :  0.8813559322033898


In [338]:
print ('\nClasification report:\n', classification_report(Y_test, predictions))


Clasification report:
               precision    recall  f1-score   support

           0       1.00      0.58      0.73        33
           1       0.79      1.00      0.88        52

   micro avg       0.84      0.84      0.84        85
   macro avg       0.89      0.79      0.81        85
weighted avg       0.87      0.84      0.82        85



### Grid Search

In [339]:
ngram = (1,1)
maxf= 800

In [340]:
hyper_parameters = [{ 'solver':['warn','lbfgs'], 'C': [1, 10, 100, 1000],'max_iter':[2,5,10,100,1000,2000]}]

In [341]:
g_classifier = GridSearchCV(LogisticRegression(), hyper_parameters, cv=5, scoring='f1')

In [342]:
g_classifier.fit(X_train, Y_train)







GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'C': [1, 10, 100, 1000], 'solver': ['warn', 'lbfgs'], 'max_iter': [2, 5, 10, 100, 1000, 2000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [343]:
best_g_param = g_classifier.best_params_
best_g_classifier = g_classifier.best_estimator_
print('best param : ',best_g_param)

best param :  {'C': 100, 'solver': 'lbfgs', 'max_iter': 5}


In [344]:
predictions = g_classifier.predict(X_test)

In [345]:
print ("Accuracy Score : ",accuracy_score(Y_test, predictions))
print ("F1 Score : ",f1_score(Y_test, predictions))

Accuracy Score :  0.9647058823529412
F1 Score :  0.9714285714285713


In [346]:
print ('\nClasification report:\n', classification_report(Y_test, predictions))


Clasification report:
               precision    recall  f1-score   support

           0       0.97      0.94      0.95        33
           1       0.96      0.98      0.97        52

   micro avg       0.96      0.96      0.96        85
   macro avg       0.97      0.96      0.96        85
weighted avg       0.96      0.96      0.96        85



## Grid search helps to noticeably improve performance 