# Logistic Regression Modeling

In [1]:
#Import necessary libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from nltk.corpus import stopwords

In [2]:
# Load the modeling dataset and preview its first rows.
dataset_path = '../../Datasets/friends-modeling.csv'
friends = pd.read_csv(dataset_path)
friends.head()

Unnamed: 0,season,episode,character,dialogue
0,s01,e01,Monica Geller,There's nothing to tell! He's just some guy I ...
1,s01,e01,Joey Tribbiani,"C'mon, you're going out with the guy! There's ..."
2,s01,e01,Chandler Bing,"All right Joey, be nice. So does he have a hum..."
3,s01,e01,Phoebe Buffay,"Wait, does he eat chalk?"
4,s01,e01,Phoebe Buffay,"Just, 'cause, I don't want her to go through w..."


In [3]:
# Make sure there are no nulls. CountVectorizer rejects missing (NaN)
# documents, so fail here with a readable message rather than later
# inside the vectorizer/model fit.
null_counts = friends.isnull().sum()
assert null_counts.sum() == 0, (
    f"Unexpected null values:\n{null_counts[null_counts > 0]}"
)
null_counts

season       0
episode      0
character    0
dialogue     0
dtype: int64

### Make X and y values 

In [4]:
# Features are the raw dialogue text; the target is the character label.
X, y = friends['dialogue'], friends['character']

### Split into Train and Test

In [5]:
# Stratify on the label so each character keeps the same share in both
# splits. No test_size is passed, so scikit-learn's default hold-out
# fraction applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [6]:
# Sanity check: number of training dialogue lines.
X_train.shape

(33604,)

In [7]:
# Should match X_train.shape: one label per training dialogue line.
y_train.shape

(33604,)

In [8]:
# Sanity check: number of held-out dialogue lines.
X_test.shape

(11202,)

In [9]:
# Should match X_test.shape: one label per held-out dialogue line.
y_test.shape

(11202,)

### Baseline Accuracy 

In [10]:
# Share of each character in the training labels. The largest share is the
# accuracy of always predicting the most frequent character (the baseline).
y_train.value_counts(normalize=True)

Rachel Green      0.177657
Ross Geller       0.177152
Chandler Bing     0.169355
Monica Geller     0.167242
Joey Tribbiani    0.160725
Phoebe Buffay     0.147869
Name: character, dtype: float64

### Instantiating CountVectorizer, Fit and Transform

In [11]:
# Bag-of-words vectorizer, constructed with no arguments (library defaults).
cv = CountVectorizer()

In [12]:
# Learn the vocabulary from the training dialogue only, so no test text
# influences the features.
cv.fit(X_train)

CountVectorizer()

In [13]:
# Encode both splits with the vocabulary already fitted on the training set.
X_train_cv, X_test_cv = (cv.transform(split) for split in (X_train, X_test))

### Modeling 

In [14]:
# max_iter is set explicitly to 1000 — presumably to give the solver room
# to converge on the sparse count features; TODO confirm. random_state is
# fixed for reproducibility.
logreg = LogisticRegression(max_iter=1000, random_state=42)

In [16]:
# Fit the classifier on the count-vectorized training dialogue.
logreg.fit(X_train_cv, y_train)

LogisticRegression(max_iter=1000, random_state=42)

In [18]:
# Score on the training split vs. the held-out split; a large gap between
# the two indicates overfitting.
logreg_train_acc = logreg.score(X_train_cv, y_train)
logreg_test_acc = logreg.score(X_test_cv, y_test)
logreg_train_acc, logreg_test_acc

(0.5459469110820141, 0.3064631315836458)

In [20]:
# Predicted character for every held-out dialogue line.
preds_1 = logreg.predict(X_test_cv)

In [21]:
# Side-by-side table of the true label, the predicted label and the dialogue
# text, keeping the row index of y_test. Built in one chained expression so
# re-running the cell always starts from y_test rather than mutating a frame.
df_params_1 = y_test.to_frame(name='actual').assign(
    predictions=preds_1,
    dialogue=X_test,
)
df_params_1.head(10)

Unnamed: 0,actual,predictions,dialogue
8260,Monica Geller,Ross Geller,Then what's the problem?
12970,Phoebe Buffay,Rachel Green,"Yeah, well, everybody does! I'm a really cool ..."
9682,Rachel Green,Rachel Green,What? What? He's interested in you. He-he like...
22017,Monica Geller,Monica Geller,I've never loved anybody as much as I love you.
5611,Rachel Green,Joey Tribbiani,And I'm in it? Then let me read it.
22331,Joey Tribbiani,Joey Tribbiani,"Yeah, I gotta go! I got an acting job. Like yo..."
18609,Monica Geller,Phoebe Buffay,Great. So the ball is in his court?
23737,Monica Geller,Monica Geller,"Dad, please don't pick your teeth out here! Al..."
35446,Ross Geller,Chandler Bing,"Excellent! Excellent, now-now do you want anot..."
3756,Monica Geller,Monica Geller,How are you?


In [23]:
# Count the held-out lines where the prediction differs from the true label.
is_wrong = df_params_1['actual'] != df_params_1['predictions']
df_params_1.loc[is_wrong].count()

actual         7769
predictions    7769
dialogue       7769
dtype: int64

In [24]:
# Count the held-out lines where the prediction matches the true label.
is_right = df_params_1['actual'] == df_params_1['predictions']
df_params_1.loc[is_right].count()

actual         3433
predictions    3433
dialogue       3433
dtype: int64

### Setting up a Pipeline for all Logistic Regression Modeling

In [28]:
# Chain the vectorizer and the classifier. The step names 'cv' and 'log'
# are the prefixes used by the grid-search parameter keys.
pipeline_steps = [
    ('cv', CountVectorizer()),
    ('log', LogisticRegression(random_state=42)),
]
pipe = Pipeline(steps=pipeline_steps)

### Modeling: Basic Model with Default Parameters and using CountVectorizer

In [30]:
# Compare no stop-word removal against the built-in English list;
# max_iter is held at a single value of 1000.
grid_d = {
    'cv__stop_words': [None, 'english'],
    'log__max_iter': [1000],
}

In [31]:
# Instantiate the grid search and fit it on the raw training dialogue
# (the pipeline does the vectorizing itself). fit() returns the fitted
# search object, so the assignment can be chained.
gs_d = GridSearchCV(estimator=pipe, param_grid=grid_d).fit(X_train, y_train)
gs_d

GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('log',
                                        LogisticRegression(random_state=42))]),
             param_grid={'cv__stop_words': [None, 'english'],
                         'log__max_iter': [1000]})

In [32]:
# Parameter combination selected by the grid search.
gs_d.best_params_

{'cv__stop_words': None, 'log__max_iter': 1000}

In [36]:
# Score of the fitted grid search on the training and held-out splits.
gs_d_train_acc = gs_d.score(X_train, y_train)
gs_d_test_acc = gs_d.score(X_test, y_test)
gs_d_train_acc, gs_d_test_acc

(0.5136202952796839, 0.29730994152046786)