# Logistic Regression Modeling-- Binary Classification

In [1]:
#Import necessary libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from nltk.corpus import stopwords

#For streamlit app
import pickle

In [2]:
#import the dataset
friends = pd.read_csv('../../../Datasets/friends-modeling.csv')
friends.head()

Unnamed: 0,season,episode,character,dialogue
0,s01,e01,Monica Geller,There's nothing to tell! He's just some guy I ...
1,s01,e01,Joey Tribbiani,"C'mon, you're going out with the guy! There's ..."
2,s01,e01,Chandler Bing,"All right Joey, be nice. So does he have a hum..."
3,s01,e01,Phoebe Buffay,"Wait, does he eat chalk?"
4,s01,e01,Phoebe Buffay,"Just, 'cause, I don't want her to go through w..."


In [3]:
#Make sure no nulls
friends.isnull().sum()

season       0
episode      0
character    0
dialogue     0
dtype: int64

### Only Predicting the Couples! 
----
First, Rachel and Ross

In [4]:
friends[(friends['character'] == 'Ross Geller') | (friends['character'] == 'Rachel Green')].shape

(15899, 4)

In [5]:
#Dropping all characters but Rachel and Ross
rach_ross = friends[(friends['character'] == 'Ross Geller') | (friends['character'] == 'Rachel Green')]

### Make X and y values 

In [6]:
X_rr = rach_ross['dialogue']
y_rr = rach_ross['character']

### Split into Train and Test

In [7]:
X_train_rr, X_test_rr, y_train_rr, y_test_rr = train_test_split(X_rr, y_rr, random_state=42, stratify=y_rr)

### Baseline Accuracy 

Baseline to beat when making the models

In [8]:
y_train_rr.value_counts(normalize=True)

Rachel Green    0.500755
Ross Geller     0.499245
Name: character, dtype: float64

### Instaniating Count Vectorizer, Fit and Transform 

In [9]:
cv = CountVectorizer()

Fitting only on the training data and transforming only on X_train and X_test

In [10]:
cv.fit(X_train_rr)

CountVectorizer()

In [11]:
X_train_cv_rr = cv.transform(X_train_rr)
X_test_cv_rr = cv.transform(X_test_rr)

### Modeling 

Instantiate a logisitic regression model with an instance of the class LogisticRegression.

In [12]:
logreg = LogisticRegression(max_iter=1000, random_state=42)

In [13]:
#fitting on the training set -- need to pass in X_train_cv!
logreg.fit(X_train_cv_rr, y_train_rr)

LogisticRegression(max_iter=1000, random_state=42)

In [14]:
#Scoring on the training and testing sets to see if there is overfitting or underfitting.
print(f'Train score: {logreg.score(X_train_cv_rr, y_train_rr)} \nTest score: {logreg.score(X_test_cv_rr, y_test_rr)}')

Train score: 0.782120093928212 
Test score: 0.6407547169811321



---
**Making Predictions**

In [15]:
#Making predictions using X_test_cv
preds_rr = logreg.predict(X_test_cv_rr)

I created a dataframe consisting of the predicted results, actual results, and the dialouge. 

In [16]:
df_params_rr = pd.DataFrame(y_test_rr)
df_params_rr['predictions'] = preds_rr
df_params_rr['dialogue'] = X_test_rr
df_params_rr.rename(columns={'character': 'actual'}, inplace=True)
df_params_rr.head(10)

Unnamed: 0,actual,predictions,dialogue
16472,Rachel Green,Ross Geller,"Well, apparently he scares easy."
33805,Rachel Green,Ross Geller,"Oh, you poor little famous man."
9632,Ross Geller,Rachel Green,"Nothing I do means anything, really."
18651,Ross Geller,Ross Geller,"Oh-oh-ooh, hey guys, I was wondering if you gu..."
39215,Ross Geller,Ross Geller,"Hey hey, can I help?"
42316,Rachel Green,Rachel Green,If you hold a spider.
11762,Rachel Green,Rachel Green,"Oh, okay."
25566,Rachel Green,Rachel Green,Do you guys know any cute guys?
41566,Ross Geller,Rachel Green,"C'mon you guys, this is really important to us."
1218,Ross Geller,Ross Geller,I'm going to do it.


In [17]:
df_params_rr['predictions'].value_counts()

Rachel Green    2002
Ross Geller     1973
Name: predictions, dtype: int64

In [18]:
df_params_rr['actual'].value_counts()

Rachel Green    1990
Ross Geller     1985
Name: actual, dtype: int64

In [19]:
#How many rows were missclassified?
df_params_rr.loc[df_params_rr['actual']!= df_params_rr['predictions']].count()

actual         1428
predictions    1428
dialogue       1428
dtype: int64

In [20]:
#How many rows were accurately predicted?
df_params_rr.loc[df_params_rr['actual']== df_params_rr['predictions']].count()

actual         2547
predictions    2547
dialogue       2547
dtype: int64

---
**Predicting Some Phrases**

In [21]:
#logreg.predict(["How you doin'?"])[0]

In [22]:
#logreg.predict(['Smelly cat, smelly cat, what are they feeding you'])[0]

In [23]:
#logreg.predict(['We were on a break!'])[0]


<br>

-----
### Setting up a Pipe for all Logisitic Regression Modeling

In [24]:
pipe = Pipeline(steps=[('cv', CountVectorizer()),
                      ('log', LogisticRegression(random_state=42))])

### Modeling: Basic Model with Default Parameters and using CountVectorizer

In [25]:
grid = {'cv__stop_words':[None, 'english'],
         'log__max_iter': [200, 250, 300, 350, 400, 450, 500]}

In [26]:
# Instaniate a gridSearch 
gs_rr = GridSearchCV(estimator=pipe, param_grid=grid)
gs_rr.fit(X_train_rr, y_train_rr)

GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('log',
                                        LogisticRegression(random_state=42))]),
             param_grid={'cv__stop_words': [None, 'english'],
                         'log__max_iter': [200, 250, 300, 350, 400, 450, 500]})

In [27]:
gs_rr.best_params_

{'cv__stop_words': None, 'log__max_iter': 200}

In [28]:
print(f"Train score: {gs_rr.score(X_train_rr, y_train_rr)} \nTest score: {gs_rr.score(X_test_rr, y_test_rr)}")

Train score: 0.782120093928212 
Test score: 0.6407547169811321



---
**Making Predictions**

In [29]:
#Making predictions
preds_gs_rr = gs_rr.predict(X_test_rr)

I created a dataframe consisting of the predicted results, actual results, and the dialouge. 

In [30]:
df_gs_rr = pd.DataFrame(y_test_rr)
df_gs_rr['predictions'] = preds_gs_rr 
df_gs_rr['dialogue'] = X_test_rr
df_gs_rr.rename(columns={'character': 'actual'}, inplace=True)
df_gs_rr.head(10)

Unnamed: 0,actual,predictions,dialogue
16472,Rachel Green,Ross Geller,"Well, apparently he scares easy."
33805,Rachel Green,Ross Geller,"Oh, you poor little famous man."
9632,Ross Geller,Rachel Green,"Nothing I do means anything, really."
18651,Ross Geller,Ross Geller,"Oh-oh-ooh, hey guys, I was wondering if you gu..."
39215,Ross Geller,Ross Geller,"Hey hey, can I help?"
42316,Rachel Green,Rachel Green,If you hold a spider.
11762,Rachel Green,Rachel Green,"Oh, okay."
25566,Rachel Green,Rachel Green,Do you guys know any cute guys?
41566,Ross Geller,Rachel Green,"C'mon you guys, this is really important to us."
1218,Ross Geller,Ross Geller,I'm going to do it.


In [31]:
df_gs_rr['predictions'].value_counts()

Rachel Green    2002
Ross Geller     1973
Name: predictions, dtype: int64

In [32]:
df_gs_rr['actual'].value_counts()

Rachel Green    1990
Ross Geller     1985
Name: actual, dtype: int64

In [33]:
#How many rows were missclassified?
df_gs_rr.loc[df_gs_rr['actual']!= df_gs_rr['predictions']].count()

actual         1428
predictions    1428
dialogue       1428
dtype: int64

In [34]:
#How many rows were accurately predicted?
df_gs_rr.loc[df_gs_rr['actual']== df_gs_rr['predictions']].count()

actual         2547
predictions    2547
dialogue       2547
dtype: int64

---
**Predicting Some Phrases**

In [35]:
gs_rr.predict(["Oh, are you setting Ross up with someone? Does she have a wedding dress"])[0]

'Rachel Green'

In [36]:
gs_rr.predict(['We were on a break!'])[0]

'Ross Geller'

<br>

---- 
### Exporting the Model Using Pickle

In [37]:
#Using the best model which was the 
with open('../../log-reg-rachross.pkl', mode='wb') as pickle_out:
    pickle.dump(gs_rr, pickle_out)

<br>
<br>


----
#### Monica and Chandler

In [38]:
friends[(friends['character'] == 'Monica Geller') | (friends['character'] == 'Chandler Bing')].shape

(15081, 4)

In [39]:
#Dropping all characters but Monica and Chandler
mon_chan = friends[(friends['character'] == 'Monica Geller') | (friends['character'] == 'Chandler Bing')]

### Make X and y values 

In [40]:
X_mc = mon_chan['dialogue']
y_mc = mon_chan['character']

### Split into Train and Test

In [41]:
X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(X_mc, y_mc, random_state=42, stratify=y_mc)

### Baseline Accuracy 

Baseline to beat when making the models

In [42]:
y_train_mc.value_counts(normalize=True)

Chandler Bing    0.503183
Monica Geller    0.496817
Name: character, dtype: float64

### Instaniating Count Vectorizer, Fit and Transform 

In [43]:
cv_mc = CountVectorizer()

Fitting only on the training data and transforming only on X_train and X_test

In [44]:
cv_mc.fit(X_train_mc)

CountVectorizer()

In [45]:
X_train_cv_mc = cv.transform(X_train_mc)
X_test_cv_mc = cv.transform(X_test_mc)

### Modeling 

Instantiate a logisitic regression model with an instance of the class LogisticRegression.

In [46]:
logreg = LogisticRegression(max_iter=1000, random_state=42)

In [47]:
#fitting on the training set -- need to pass in X_train_cv!
logreg.fit(X_train_cv_mc, y_train_mc)

LogisticRegression(max_iter=1000, random_state=42)

In [48]:
#Scoring on the training and testing sets to see if there is overfitting or underfitting.
print(f'Train score: {logreg.score(X_train_cv_mc, y_train_mc)} \nTest score: {logreg.score(X_test_cv_mc, y_test_mc)}')

Train score: 0.7519009725906277 
Test score: 0.6030230708035004



---
**Making Predictions**

In [49]:
#Making predictions using X_test_cv
preds_mc = logreg.predict(X_test_cv_mc)

I created a dataframe consisting of the predicted results, actual results, and the dialouge. 

In [50]:
df_params_mc = pd.DataFrame(y_test_mc)
df_params_mc['predictions'] = preds_mc 
df_params_mc['dialogue'] = X_test_mc
df_params_mc.rename(columns={'character': 'actual'}, inplace=True)
df_params_mc.head(10)

Unnamed: 0,actual,predictions,dialogue
9890,Chandler Bing,Monica Geller,"I'll be a fool for you. I'm sure, you know I d..."
21948,Chandler Bing,Chandler Bing,See you later Mon.
39041,Monica Geller,Chandler Bing,"Wow, that's a big cable bill! Huh, you don't h..."
35763,Monica Geller,Monica Geller,Okay fine! I keep betting Phoebe that you're g...
2703,Chandler Bing,Chandler Bing,"You know that thing, when you and I talk to ea..."
24629,Chandler Bing,Chandler Bing,Yes it was very sad when the guy stopped drawi...
29992,Chandler Bing,Chandler Bing,"Uh, could you leave me one?"
15401,Monica Geller,Monica Geller,"What, he doesn't like Josh?"
39482,Monica Geller,Chandler Bing,Oh! You assume because I was heavy that's the ...
9963,Monica Geller,Chandler Bing,Score!! 7 to nothing!


In [51]:
df_params_mc['predictions'].value_counts()

Chandler Bing    1914
Monica Geller    1857
Name: predictions, dtype: int64

In [52]:
df_params_mc['actual'].value_counts()

Chandler Bing    1897
Monica Geller    1874
Name: actual, dtype: int64

In [53]:
#How many rows were missclassified?
df_params_mc.loc[df_params_mc['actual']!= df_params_mc['predictions']].count()

actual         1497
predictions    1497
dialogue       1497
dtype: int64

In [54]:
#How many rows were accurately predicted?
df_params_mc.loc[df_params_mc['actual']== df_params_mc['predictions']].count()

actual         2274
predictions    2274
dialogue       2274
dtype: int64

---
**Predicting Some Phrases**

In [55]:
#logreg.predict(["How you doin'?"])[0]

In [56]:
#logreg.predict(['Smelly cat, smelly cat, what are they feeding you'])[0]

In [57]:
#logreg.predict(['We were on a break!'])[0]

### Modeling: Basic Model with Default Parameters and using CountVectorizer

In [58]:
# Instaniate a gridSearch 
gs_mc = GridSearchCV(estimator=pipe, param_grid=grid)
gs_mc.fit(X_train_mc, y_train_mc)

GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('log',
                                        LogisticRegression(random_state=42))]),
             param_grid={'cv__stop_words': [None, 'english'],
                         'log__max_iter': [200, 250, 300, 350, 400, 450, 500]})

In [59]:
gs_mc.best_params_

{'cv__stop_words': None, 'log__max_iter': 200}

In [60]:
print(f"Train score: {gs_mc.score(X_train_mc, y_train_mc)} \nTest score: {gs_mc.score(X_test_mc, y_test_mc)}")

Train score: 0.7844385499557913 
Test score: 0.6083267037920976



---
**Making Predictions**

In [61]:
#Making predictions
preds_gs_mc = gs_mc.predict(X_test_mc)

I created a dataframe consisting of the predicted results, actual results, and the dialouge. 

In [62]:
df_gs_mc = pd.DataFrame(y_test_mc)
df_gs_mc['predictions'] = preds_gs_mc 
df_gs_mc['dialogue'] = X_test_mc
df_gs_mc.rename(columns={'character': 'actual'}, inplace=True)
df_gs_mc.head(10)

Unnamed: 0,actual,predictions,dialogue
9890,Chandler Bing,Monica Geller,"I'll be a fool for you. I'm sure, you know I d..."
21948,Chandler Bing,Chandler Bing,See you later Mon.
39041,Monica Geller,Chandler Bing,"Wow, that's a big cable bill! Huh, you don't h..."
35763,Monica Geller,Monica Geller,Okay fine! I keep betting Phoebe that you're g...
2703,Chandler Bing,Chandler Bing,"You know that thing, when you and I talk to ea..."
24629,Chandler Bing,Chandler Bing,Yes it was very sad when the guy stopped drawi...
29992,Chandler Bing,Chandler Bing,"Uh, could you leave me one?"
15401,Monica Geller,Monica Geller,"What, he doesn't like Josh?"
39482,Monica Geller,Chandler Bing,Oh! You assume because I was heavy that's the ...
9963,Monica Geller,Monica Geller,Score!! 7 to nothing!


In [63]:
df_gs_mc['predictions'].value_counts()

Chandler Bing    1902
Monica Geller    1869
Name: predictions, dtype: int64

In [64]:
df_gs_mc['actual'].value_counts()

Chandler Bing    1897
Monica Geller    1874
Name: actual, dtype: int64

In [65]:
#How many rows were missclassified?
df_gs_mc.loc[df_gs_mc['actual']!= df_gs_mc['predictions']].count()

actual         1477
predictions    1477
dialogue       1477
dtype: int64

In [66]:
#How many rows were accurately predicted?
df_gs_mc.loc[df_gs_mc['actual']== df_gs_mc['predictions']].count()

actual         2294
predictions    2294
dialogue       2294
dtype: int64

---
**Predicting Some Phrases**

In [67]:
gs_mc.predict(["And remember, if I am harsh with you, it’s only because you’re doing it wrong."])[0]

'Monica Geller'

In [68]:
gs_mc.predict(['I say more dumb things before 9 A.M. than most people say all day.'])[0]

'Chandler Bing'

<br>

---- 
### Exporting the Model Using Pickle

In [69]:
#Using the best model which was the 
with open('../../log-reg-monchan.pkl', mode='wb') as pickle_out:
    pickle.dump(gs_mc, pickle_out)