# Employing Various Classification Algorithms to classify emails as Business or Personal

## Importing modules

In [2]:

# Importing libraries
print("Importing libraries ... ")
import pandas as pd
import numpy as np
from collections import Counter
import datetime, time, vaex, email, re

from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as py
print("All modules are ready")




Importing libraries ... 
All modules are ready


In [3]:

from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import LogisticRegressionCV 
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV

from sklearn.svm import NuSVC
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier 

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier




## Loading
Load the data into a data structure

In [5]:
# reading dataset
file_path = "/Users/speedy/Desktop/UofT/Course load/INF2179H - Machine Learning/Notebooks/Data/"
file_name = "wtnospam2.csv"
enron = pd.read_csv(file_path + file_name)

# printing the first two lines on the dataset
enron.head(2)

Unnamed: 0,Subject,Date,From(email),To(email),EmailContent,Spam
0,missing,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,Here is our forecast\n\n,0
1,Re:,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Traveling to have a business meeting takes the...,0


In [6]:
# Exploration 1st step
def exploration():
    # Dataset type
    print("Dataset type is: ", type(enron), "\n")
    
    # Data shape
    print("The data shape is: ", enron.shape, "\n")
    
    # Checking for duplicates
    dup = enron.duplicated().sum()
    print("This dataset has ", dup, "duplicated values\n")
    
    # Column names
    print("Column names are: ", enron.columns)
    

exploration()

Dataset type is:  <class 'pandas.core.frame.DataFrame'> 

The data shape is:  (245176, 6) 

This dataset has  103052 duplicated values

Column names are:  Index(['Subject', 'Date', 'From(email)', 'To(email)', 'EmailContent', 'Spam'], dtype='object')


## Manipulating



In [7]:
msg = enron["EmailContent"][2]
msg

'Please cc the following distribution list with updates:\n\nPhillip Allen (pallen@enron.com)\nMike Grigsby (mike.grigsby@enron.com)\nKeith Holst (kholst@enron.com)\nMonique Sanchez\nFrank Ermis\nJohn Lavorato\n\n\nThank you for your help\n\nPhillip Allen\n'

# Data exploration

In [8]:
# Getting unique emails sent by From(Name)
enron.iloc[:, 2].value_counts()


jeff.dasovich@enron.com       6632
kay.mann@enron.com            6422
vince.kaminski@enron.com      6370
pete.davis@enron.com          4502
tana.jones@enron.com          3898
                              ... 
moontiger13@hotmail.com          1
bob.carter@penreco.com           1
hopeful@glay.org                 1
tom.mccall@entergykoch.com       1
david.ingram@enron.com           1
Name: From(email), Length: 13925, dtype: int64

In [9]:
# Getting unique emails received by To(Name)
enron.iloc[:, 3].value_counts() 

missing                                                                                                                                                                                                                                                                                                         7252
pete.davis@enron.com                                                                                                                                                                                                                                                                                            4507
vkaminski@aol.com                                                                                                                                                                                                                                                                                               3232
tana.jones@enron.com                                                     

In [10]:
# Checking null values
enron.isnull().sum()
# email_clean.sum()

Subject         0
Date            0
From(email)     0
To(email)       0
EmailContent    0
Spam            0
dtype: int64

In [11]:
enron.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245176 entries, 0 to 245175
Data columns (total 6 columns):
Subject         245176 non-null object
Date            245176 non-null object
From(email)     245176 non-null object
To(email)       245176 non-null object
EmailContent    245176 non-null object
Spam            245176 non-null int64
dtypes: int64(1), object(5)
memory usage: 11.2+ MB


In [12]:
enron.describe()

Unnamed: 0,Spam
count,245176.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


In [13]:
enron.loc[enron["Subject"]=='CONFIDENTIAL'].count()

Subject         13
Date            13
From(email)     13
To(email)       13
EmailContent    13
Spam            13
dtype: int64

In [14]:
top10_from = enron["From(email)"].value_counts()[:10]
top10_to = enron["To(email)"].value_counts()[:10]

In [15]:
top = pd.DataFrame(top10_to)
top

Unnamed: 0,To(email)
missing,7252
pete.davis@enron.com,4507
vkaminski@aol.com,3232
tana.jones@enron.com,2147
jeff.dasovich@enron.com,2087
sara.shackleton@enron.com,1960
louise.kitchen@enron.com,1906
kate.symes@enron.com,1818
gerald.nemec@enron.com,1576
mark.taylor@enron.com,1515


The above plot requires more investigation, but preliminary findings show discrepancies between Sender's name versus Sender's email as well as Receiver's name versus Receiver's email address. It can be caused due to error in our algorithm but it can also mean that the they are not exclusively using email business accounts as values diverge in more than 0.2 in total volume for the most active agents (Kay and Vince for example).

In [17]:
enron.columns

Index(['Subject', 'Date', 'From(email)', 'To(email)', 'EmailContent', 'Spam'], dtype='object')

# ML Pipeline

In [18]:
Counter(enron["Spam"])

Counter({0: 245176})

In [19]:
df = enron[["EmailContent", "Spam"]]
df.columns = ["EmailContent", "Label"]

bmeeting = ["meeting", "schedule", "memo", "conference", "agenda", "date"]
bfollowup = ["agreement", "report", "chart", "announcement", "draft", "change", "procedure", "late", "deadline", 
             "proposal", "contract", "letter", "follow-up", "summary", "supplemental", "approval", "template"]
bconfidential = ["confidential", "privacy", "secret", "topsecret", "alert", "board", "committee", "fraud", "scam",
                 "fbi", "investigation", "access", "proceeds", "donation", "transaction", "liquidation", "risk",
                 "lawsuit", "sue", "violation", "criminal", "criminous", "scandal"]
bgeneral = ["gas", "energy", "power", "development", "global", "organizational", "operation", "customer", "intern", 
            "associate", "hiring", "position", "patenting", "finance", "restruct", "balance", "stock", "transport", 
            "model", "management", "executive", "infrastructure"]
pfamilyfriends = ["girlfriend", "boyfriend", "fwd", "buddy", "whassup", "weekend", "plans", "football",
                  "hunt", "friends", "wife", "husband", "hubby", "drugs", "holiday", "season", "winter", "summer"]

df["Label"]='other' #will add a new label with everything being other as a default
df["Label"]=6
def label(dataset):
    for i in range(len(dataset)):
        for j in range(len(bmeeting)):  
            word=bmeeting[j]
            if word in dataset.iloc[:, 0][i]:#change the column position for email content if needed
#                 dataset.iloc[:, -1][i]='bmeeting'
                dataset.iloc[:, -1][i]= 1
            else:
                pass
        for k in range(len(bfollowup)):
            word=bfollowup[k]
            if word in dataset.iloc[:, 0][i]:#change the column position for email content if needed
#                 dataset.iloc[:, -1][i]='bfollowup' 
                dataset.iloc[:, -1][i]=2
            else:
                pass        
        for k in range(len(bconfidential)):
            word=bconfidential[k]
            if word in dataset.iloc[:, 0][i]:#change the column position for email content if needed
#                 dataset.iloc[:, -1][i]='bconfidential'
                dataset.iloc[:, -1][i]=3
            else:
                pass
        for k in range(len(bgeneral)):
            word=bgeneral[k]
            if word in dataset.iloc[:, 0][i]:#change the column position for email content if needed
#                 dataset.iloc[:, -1][i]='bgeneral'
                dataset.iloc[:, -1][i]=4
            else:pass
        for k in range(len(pfamilyfriends)):
            word=pfamilyfriends[k]
            if word in dataset.iloc[:, 0][i]:#change the column position for email content if needed
#                 dataset.iloc[:, -1][i]='pfamilyfriends'
                dataset.iloc[:, -1][i]=5
    return dataset

enron_final = label(df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice fro

In [20]:
enron_final.shape

(245176, 2)

In [42]:
# enron_final.to_csv("finalLabel.csv")

In [22]:
#pre-processing
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)  
    string = re.sub(r"\\n", "", string) 
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string) 
    string = re.sub(r"\\'", "", string)    
    string = re.sub(r"\\", "", string) 
    string = re.sub(r"--", "", string)
    return string.strip().lower()


X = []
for i in range(enron_final.shape[0]):
    X.append(clean_str(enron_final.iloc[i][0]))
y = np.array(enron_final["Label"])

enron_str = clean_str(str(enron_final["EmailContent"]))
enron_str = pd.DataFrame(X)
enron_str.columns = ["EmailContent"]

In [28]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("linearsvcO", OneVsRestClassifier(LinearSVC(multi_class="ovr")))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("linearsvcO: ", gs_clf_svm.best_score_)
print("linearsvcO params: ", gs_clf_svm.best_params_)

linearsvcO:  0.6917
linearsvcO params:  {'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)}


In [29]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("BERN", OneVsRestClassifier(BernoulliNB()))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("BERN: ", gs_clf_svm.best_score_)
print("BERN params: ", gs_clf_svm.best_params_)

BERN:  0.5341999999999999
BERN params:  {'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)}


In [30]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("DT", OneVsRestClassifier(DecisionTreeClassifier()))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("DT: ", gs_clf_svm.best_score_)
print("DT params: ", gs_clf_svm.best_params_)

DT:  0.7253000000000001
DT params:  {'tfidf__use_idf': False, 'vectorizer__ngram_range': (1, 1)}


In [31]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("ET", OneVsRestClassifier(ExtraTreeClassifier()))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("ET: ", gs_clf_svm.best_score_)
print("ET params: ", gs_clf_svm.best_params_)

ET:  0.6022000000000001
ET params:  {'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}


In [32]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("ETS", OneVsRestClassifier(ExtraTreesClassifier()))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("ETS: ", gs_clf_svm.best_score_)
print("ETS params: ", gs_clf_svm.best_params_)

ETS:  0.6967
ETS params:  {'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)}


In [34]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("KNN", OneVsRestClassifier(KNeighborsClassifier()))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("KNN: ", gs_clf_svm.best_score_)
print("KNN params: ", gs_clf_svm.best_params_)

KNN:  0.4542
KNN params:  {'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)}


In [38]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("linearsvc", OneVsRestClassifier(LinearSVC(multi_class="crammer_singer")))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("linearsvc: ", gs_clf_svm.best_score_)
print("linearsvc params: ", gs_clf_svm.best_params_)


Liblinear failed to converge, increase the number of iterations.



linearsvc:  0.6788000000000001
linearsvc params:  {'tfidf__use_idf': False, 'vectorizer__ngram_range': (1, 2)}


In [39]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("LRM", OneVsRestClassifier(LogisticRegression(multi_class="multinomial")))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("LRM: ", gs_clf_svm.best_score_)
print("LRM params: ", gs_clf_svm.best_params_)

LRM:  0.6593
LRM params:  {'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)}


In [None]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("RF", OneVsRestClassifier(RandomForestClassifier()))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("RF: ", gs_clf_svm.best_score_)
print("RF params: ", gs_clf_svm.best_params_)

In [None]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("RC", OneVsRestClassifier(RidgeClassifier()))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("RC: ", gs_clf_svm.best_score_)
print("RC params: ", gs_clf_svm.best_params_)

In [None]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("RCCV", OneVsRestClassifier(RidgeClassifierCV()))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("RCCV: ", gs_clf_svm.best_score_)
print("RCCV params: ", gs_clf_svm.best_params_)

In [None]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("SVC", OneVsRestClassifier(SVC()))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("SVC: ", gs_clf_svm.best_score_)
print("SVC params: ", gs_clf_svm.best_params_)

In [None]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("GBC", OneVsRestClassifier(GradientBoostingClassifier()))]) #XGBOOSTING
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("GBC: ", gs_clf_svm.best_score_)
print("GBC params: ", gs_clf_svm.best_params_)

In [None]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("LRO", OneVsRestClassifier(LogisticRegression(multi_class="ovr")))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("LRO: ", gs_clf_svm.best_score_)
print("LRO params: ", gs_clf_svm.best_params_)

In [None]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("SGDC", OneVsRestClassifier(SGDClassifier()))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("SGDC: ", gs_clf_svm.best_score_)
print("SGDC params: ", gs_clf_svm.best_params_)

In [None]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("PER", OneVsRestClassifier(Perceptron()))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("PER: ", gs_clf_svm.best_score_)
print("PER params: ", gs_clf_svm.best_params_)

In [None]:
#pipeline of feature engineering and model
model = Pipeline([("vectorizer", CountVectorizer()), ("tfidf", TfidfTransformer()),
                  ("PAC", OneVsRestClassifier(PassiveAggressiveClassifier()))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

#paramater selection
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X[:10000], y[:10000])
print("PAC: ", gs_clf_svm.best_score_)
print("PAC params: ", gs_clf_svm.best_params_)