# Logistic Regression Modeling-- Binary Classification

In [1]:
#Import necessary libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from nltk.corpus import stopwords

#For streamlit app
import pickle

In [2]:
#import the dataset
friends = pd.read_csv('../../Datasets/friends-modeling.csv')
friends.head()

Unnamed: 0,season,episode,character,dialogue
0,s01,e01,Monica Geller,There's nothing to tell! He's just some guy I ...
1,s01,e01,Joey Tribbiani,"C'mon, you're going out with the guy! There's ..."
2,s01,e01,Chandler Bing,"All right Joey, be nice. So does he have a hum..."
3,s01,e01,Phoebe Buffay,"Wait, does he eat chalk?"
4,s01,e01,Phoebe Buffay,"Just, 'cause, I don't want her to go through w..."


In [3]:
#Make sure no nulls
friends.isnull().sum()

season       0
episode      0
character    0
dialogue     0
dtype: int64

### Only Predicting the Couples! 
----
First, Rachel and Ross

In [4]:
friends[(friends['character'] == 'Ross Geller') | (friends['character'] == 'Rachel Green')].shape

(15899, 4)

In [5]:
# Keep only the lines spoken by Rachel Green or Ross Geller
couple_rr = ['Ross Geller', 'Rachel Green']
rach_ross = friends[friends['character'].isin(couple_rr)]

### Make X and y values 

In [6]:
X = rach_ross['dialogue']
y = rach_ross['character']

### Split into Train and Test

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

### Baseline Accuracy 

Baseline to beat when making the models

In [8]:
y_train.value_counts(normalize=True)

Rachel Green    0.500755
Ross Geller     0.499245
Name: character, dtype: float64

### Instantiating Count Vectorizer, Fit and Transform 

In [9]:
cv = CountVectorizer()

Fit the vectorizer on the training data only, then use it to transform both X_train and X_test.

In [None]:
cv.fit(X_train)

In [None]:
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

### Modeling 

Instantiate a logistic regression model with an instance of the class LogisticRegression.

In [None]:
logreg = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
#fitting on the training set -- need to pass in X_train_cv!
logreg.fit(X_train_cv, y_train)

In [None]:
#Scoring on the training and testing sets to see if there is overfitting or underfitting.
print(f'Train score: {logreg.score(X_train_cv, y_train)} \nTest score: {logreg.score(X_test_cv, y_test)}')


---
**Making Predictions**

In [None]:
#Making predictions using X_test_cv
preds_rr = logreg.predict(X_test_cv)

I created a dataframe consisting of the predicted results, actual results, and the dialogue. 

In [None]:
# Collect actual labels, model predictions and the spoken line side by side.
# Columns end up in the order: actual, predictions, dialogue.
df_params_rr = (
    pd.DataFrame(y_test)
    .rename(columns={'character': 'actual'})
    .assign(predictions=preds_rr, dialogue=X_test)
)
df_params_rr.head(10)

In [None]:
df_params_rr['predictions'].value_counts()

In [None]:
df_params_rr['actual'].value_counts()

In [None]:
#How many rows were misclassified? (.count() reports the non-null count for each column)
df_params_rr.loc[df_params_rr['actual']!= df_params_rr['predictions']].count()

In [None]:
#How many rows were accurately predicted? (.count() reports the non-null count for each column)
df_params_rr.loc[df_params_rr['actual']== df_params_rr['predictions']].count()

---
**Predicting Some Phrases**

In [None]:
#logreg.predict(["How you doin'?"])[0]

In [None]:
#logreg.predict(['Smelly cat, smelly cat, what are they feeding you'])[0]

In [None]:
#logreg.predict(['We were on a break!'])[0]


<br>

-----
### Setting up a Pipe for all Logistic Regression Modeling

In [None]:
pipe = Pipeline(steps=[('cv', CountVectorizer()),
                      ('log', LogisticRegression(random_state=42))])

### Modeling: Basic Model with Default Parameters and using CountVectorizer

In [None]:
grid_rr = {'cv__stop_words':[None, 'english'],
         'log__max_iter': [1000, 1250, 1500, 1750, 2000]}

In [None]:
# Instantiate a grid search over the CountVectorizer + LogisticRegression pipeline
# using the grid_rr parameter grid, then fit it on the Rachel/Ross training split.
gs_rr = GridSearchCV(estimator=pipe, param_grid=grid_rr)
gs_rr.fit(X_train, y_train)

In [None]:
gs_rr.best_params_

In [None]:
print(f"Train score: {gs_rr.score(X_train, y_train)} \nTest score: {gs_rr.score(X_test, y_test)}")


---
**Making Predictions**

In [None]:
#Making predictions
preds_gs = gs_rr.predict(X_test)

I created a dataframe consisting of the predicted results, actual results, and the dialogue. 

In [None]:
df_gs_rr = pd.DataFrame(y_test)
df_gs_rr['predictions'] = preds_gs 
df_gs_rr['dialogue'] = X_test
df_gs_rr.rename(columns={'character': 'actual'}, inplace=True)
df_gs_rr.head(10)

In [None]:
df_gs_rr['predictions'].value_counts()

In [None]:
df_gs_rr['actual'].value_counts()

In [None]:
#How many rows were missclassified?
df_gs_rr.loc[df_gs_rr['actual']!= df_gs_rr['predictions']].count()

In [None]:
#How many rows were accurately predicted?
df_gs_rr.loc[df_gs_rr['actual']== df_gs_rr['predictions']].count()

---
**Predicting Some Phrases**

In [None]:
gs_rr.predict(["Oh, are you setting Ross up with someone? Does she have a wedding dress"])[0]

In [None]:
gs_rr.predict(['We were on a break!'])[0]

<br>

---- 
### Exporting the Model Using Pickle

In [None]:
# Serialize the fitted Rachel/Ross grid search (gs_rr) to disk with pickle
# so the trained model can be loaded outside this notebook.
with open('../log-reg-rachross.pkl', mode='wb') as pickle_out:
    pickle.dump(gs_rr, pickle_out)

<br>
<br>


----
#### Monica and Chandler

In [None]:
friends[(friends['character'] == 'Monica Geller') | (friends['character'] == 'Chandler Bing')].shape

In [None]:
#Dropping all characters but Monica and Chandler
mon_chan = friends[(friends['character'] == 'Monica Geller') | (friends['character'] == 'Chandler Bing')]

### Make X and y values 

In [None]:
X_mc = mon_chan['dialogue']
y_mc = mon_chan['character']

In [None]:
X_mc.shape, y_mc.shape

### Split into Train and Test

In [None]:
# Stratify on y_mc (the Monica/Chandler labels). The Rachel/Ross `y` has a
# different length and index, so it cannot be used to stratify this split.
X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(X_mc, y_mc, random_state=42, stratify=y_mc)

### Baseline Accuracy 

Baseline to beat when making the models

In [None]:
y_train_mc.value_counts(normalize=True)

### Instantiating Count Vectorizer, Fit and Transform 

In [None]:
cv_mc = CountVectorizer()

Fit the vectorizer on the training data only, then use it to transform both X_train_mc and X_test_mc.

In [None]:
cv_mc.fit(X_train_mc)

In [None]:
# Transform with cv_mc, the vectorizer fitted on the Monica/Chandler training
# dialogue, so the features use this pair's vocabulary rather than the
# Rachel/Ross vocabulary held by `cv`.
X_train_cc = cv_mc.transform(X_train_mc)
X_test_cc = cv_mc.transform(X_test_mc)

### Modeling 

Instantiate a logistic regression model with an instance of the class LogisticRegression.

In [None]:
logreg = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
#fitting on the training set -- need to pass in the vectorized X_train_cc!
logreg.fit(X_train_cc, y_train_mc)

In [None]:
#Scoring on the training and testing sets to see if there is overfitting or underfitting.
print(f'Train score: {logreg.score(X_train_cc, y_train_mc)} \nTest score: {logreg.score(X_test_cc, y_test_mc)}')


---
**Making Predictions**

In [None]:
#Making predictions using X_test_cc
preds_mc = logreg.predict(X_test_cc)

I created a dataframe consisting of the predicted results, actual results, and the dialogue. 

In [None]:
df_params_mc = pd.DataFrame(y_test_mc)
df_params_mc['predictions'] = preds_mc 
df_params_mc['dialogue'] = X_test_mc
df_params_mc.rename(columns={'character': 'actual'}, inplace=True)
df_params_mc.head(10)

In [None]:
df_params_mc['predictions'].value_counts()

In [None]:
df_params_mc['actual'].value_counts()

In [None]:
#How many rows were missclassified?
df_params_mc.loc[df_params_mc['actual']!= df_params_mc['predictions']].count()

In [None]:
#How many rows were accurately predicted?
df_params_mc.loc[df_params_mc['actual']== df_params_mc['predictions']].count()

---
**Predicting Some Phrases**

In [None]:
#logreg.predict(["How you doin'?"])[0]

In [None]:
#logreg.predict(['Smelly cat, smelly cat, what are they feeding you'])[0]

In [None]:
#logreg.predict(['We were on a break!'])[0]


<br>

-----
### Setting up a Pipe for all Logistic Regression Modeling

In [None]:
pipe = Pipeline(steps=[('cv', CountVectorizer()),
                      ('log', LogisticRegression(random_state=42))])

### Modeling: Basic Model with Default Parameters and using CountVectorizer

In [None]:
grid_mc = {'cv__stop_words':[None, 'english'],
         'log__max_iter': [1000, 1250, 1500, 1750, 2000]}

In [None]:
# Instaniate a gridSearch 
gs_mc = GridSearchCV(estimator=pipe, param_grid=grid_mc)
gs_mc.fit(X_train, y_train)

In [None]:
gs_mc.best_params_

In [None]:
print(f"Train score: {gs_mc.score(X_train, y_train)} \nTest score: {gs_mc.score(X_test, y_test)}")


---
**Making Predictions**

In [None]:
#Making predictions
preds_gs_mc = gs_mc.predict(X_test)

I created a dataframe consisting of the predicted results, actual results, and the dialouge. 

In [None]:
df_gs_mc = pd.DataFrame(y_test)
df_gs_mc['predictions'] = preds_gs_mc 
df_gs_mc['dialogue'] = X_test
df_gs_mc.rename(columns={'character': 'actual'}, inplace=True)
df_gs_mc.head(10)

In [None]:
df_gs_mc['predictions'].value_counts()

In [None]:
df_gs_mc['actual'].value_counts()

In [None]:
#How many rows were missclassified?
df_gs_mc.loc[df_gs_mc['actual']!= df_gs_mc['predictions']].count()

In [None]:
#How many rows were accurately predicted?
df_gs_mc.loc[df_gs_mc['actual']== df_gs_mc['predictions']].count()

---
**Predicting Some Phrases**

In [None]:
gs_mc.predict(["Oh, are you setting Ross up with someone? Does she have a wedding dress"])[0]

In [None]:
gs_mc.predict(['We were on a break!'])[0]

<br>

---- 
### Exporting the Model Using Pickle

In [None]:
# Serialize the fitted Monica/Chandler grid search (gs_mc) to its own file.
# Writing to '../log-reg-rachross.pkl' here would overwrite the Rachel/Ross
# model exported earlier in the notebook.
with open('../log-reg-monchan.pkl', mode='wb') as pickle_out:
    pickle.dump(gs_mc, pickle_out)