In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import os
import torch
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
USE_STEMEER = True
SEED = 42
QUICK = True
TEST_SIZE = 0.1

In [3]:
# seeding function for reproducibility
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(SEED)

In [4]:
# Reading the csv file and removing unnecessary columns
df = pd.read_csv("../input/sentiment140/training.1600000.processed.noemoticon.csv",
                 encoding="latin1",
                 header=None)
df = df.rename(columns={0:"sentiment",
                        1:"id",
                        2:"time",
                        3:"query",
                        4:"username",
                        5:"text"})
df = df[["sentiment","text"]]
df["sentiment"] = df["sentiment"].map({0: 0, 4: 1})
df.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [5]:
df.sentiment.value_counts()
# Looks like dataset is well balanced :)

1    800000
0    800000
Name: sentiment, dtype: int64

## Text Preprocessing

In [6]:
import re
from nltk.stem.porter import PorterStemmer

use_stemmer = USE_STEMEER
if use_stemmer:
      porter_stemmer = PorterStemmer()

def preprocess_word(word):
    # Remove punctuation
    word = word.strip('\'"?!,.():;')
    # Convert more than 2 letter repetitions to 2 letter
    # funnnnny --> funny
    word = re.sub(r'(.)\1+', r'\1\1', word)
    # Remove - & '
    word = re.sub(r'(-|\')', '', word)
    return word


def is_valid_word(word):
    # Check if word begins with an alphabet
    return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)


def handle_emojis(tweet):
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|;\s?D|x-?D|X-?D)', ' EMO_POS ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)
    return tweet


def preprocess_tweet(tweet):
    processed_tweet = []
    # Replaces URLs with the word URL
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    # Replace @handle with the word USER_MENTION
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    # Replaces #hashtag with hashtag
    tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    # Remove RT (retweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # Strip space, " and ' from tweet
    tweet = tweet.strip(' "\'')
    # Replace emojis with either EMO_POS or EMO_NEG
    tweet = handle_emojis(tweet)
    # Replace multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)
    # Convert to lower case
    tweet = tweet.lower()
    
    words = tweet.split()
    for word in words:
        word = preprocess_word(word)
        if is_valid_word(word):
            if use_stemmer:
                word = str(porter_stemmer.stem(word))
        processed_tweet.append(word)
    return ' '.join(processed_tweet)

In [7]:
# Example output
print(df.text[0])
print(preprocess_tweet(df.text[0]))

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
user_ment url  aww that a bummer you shoulda got david carr of third day to do it emo_po


In [8]:
%%time
df['Processed_text'] = df.text.apply(preprocess_tweet)

CPU times: user 16min 28s, sys: 1.64 s, total: 16min 30s
Wall time: 16min 30s


In [9]:
df.head()

Unnamed: 0,sentiment,text,Processed_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",user_ment url aww that a bummer you shoulda g...
1,0,is upset that he can't update his Facebook by ...,is upset that he cant updat hi facebook by tex...
2,0,@Kenichan I dived many times for the ball. Man...,user_ment i dive mani time for the ball manag ...
3,0,my whole body feels itchy and like its on fire,my whole bodi feel itchi and like it on fire
4,0,"@nationwideclass no, it's not behaving at all....",user_ment no it not behav at all im mad whi am...


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

def train_model(Xtrain, Xtest, ytrain, ytest, quick=QUICK):
    """
    quick Boolean: 
    """
    if quick:
        clf = LogisticRegression()
    else:
        parameters = {'C':[0.01, 0.1, 1, 5, 10, 100]}
        clf = LogisticRegression()
        grid_search = GridSearchCV(clf,
                                   parameters,
                                   cv=StratifiedKFold(n_splits=3,random_state=42)\
                                   .split(X_train, y_train),
                                   verbose=2,
                                   n_jobs=4)
        clf = grid_search.fit(X_train, y_train)
        clf = LogisticRegression(C=clf.best_params_['C'])
    clf.fit(X_train,y_train)
    y_preds = clf.predict(X_test)
    probability = clf.predict_proba(X_test)[:,1]
    print(f"Accuracy is {accuracy_score(y_test, y_preds)} AUC is {roc_auc_score(y_test, probability)}")
    print(classification_report(y_test, y_preds))
    return clf

In [11]:
def print_top_features(coefs, my_map,  K=10):
    a = [my_map[i] for i in coefs.argsort()[-K:][::-1]]
    b = [coefs[i] for i in coefs.argsort()[-K:][::-1]]
    c = [my_map[i] for i in coefs.argsort()[:K]]
    d = [coefs[i] for i in coefs.argsort()[:K]]
    print(f"{'Positive_features': <20} {'coef': <5} \t\t {'negative_features': <20} {'coef': <5}")
    print("-"*70)
    for i,j,k,l in zip(a,b,c,d):
        print(f"{i: <20} {j:.4f} \t\t {k: <20} {l:.4f}")

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [13]:
X__train, X__test, y_train, y_test=train_test_split(df['Processed_text'],
                                              df['sentiment'],
                                              test_size=TEST_SIZE,
                                              shuffle=True)

## Count features

In [14]:
%%time
vectorizer = CountVectorizer(stop_words="english")
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)
my_map ={v: k for k, v in vectorizer.vocabulary_.items()}
print(f"number of features = {X_train.shape[1]}")

number of features = 234111
CPU times: user 1min 11s, sys: 1.19 s, total: 1min 13s
Wall time: 1min 13s


In [15]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)
print_top_features(clf.coef_.copy()[0], my_map)

Accuracy is 0.77755 AUC is 0.8524289583982977
              precision    recall  f1-score   support

           0       0.79      0.76      0.77     79812
           1       0.77      0.80      0.78     80188

    accuracy                           0.78    160000
   macro avg       0.78      0.78      0.78    160000
weighted avg       0.78      0.78      0.78    160000

Positive_features    coef  		 negative_features    coef 
----------------------------------------------------------------------
fuzzbal              2.5672 		 nanda                -3.7418
excitedd             2.5044 		 inaperfectworld      -3.7023
myfax                2.4921 		 sadd                 -3.6137
emailunlimit         2.4684 		 sadden               -3.4210
smilin               2.4482 		 dontyouh             -3.3968
mahasha              2.3457 		 pakcricket           -3.2005
lml                  2.2920 		 cries                -3.1300
happyy               2.2752 		 sad                  -3.1136
goodsex            

## Limited Count Features

In [16]:
%%time
vectorizer = CountVectorizer(stop_words="english", max_features=10000)
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)
my_map ={v: k for k, v in vectorizer.vocabulary_.items()}

CPU times: user 1min 11s, sys: 78 ms, total: 1min 11s
Wall time: 1min 11s


In [17]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)
print_top_features(clf.coef_.copy()[0], my_map)

Accuracy is 0.7765125 AUC is 0.8515266046650493
              precision    recall  f1-score   support

           0       0.79      0.75      0.77     79812
           1       0.76      0.80      0.78     80188

    accuracy                           0.78    160000
   macro avg       0.78      0.78      0.78    160000
weighted avg       0.78      0.78      0.78    160000

Positive_features    coef  		 negative_features    coef 
----------------------------------------------------------------------
fuzzbal              2.5819 		 inaperfectworld      -3.7611
excitedd             2.4598 		 sadd                 -3.5161
emailunlimit         2.4123 		 dontyouh             -3.4283
happyy               2.2168 		 sadden               -3.3644
goodsex              2.1868 		 pakcricket           -3.2178
reliev               2.0177 		 cries                -3.1333
musicmonday          1.9592 		 sad                  -3.0907
smile                1.9448 		 sadfac               -2.9900
congratul        

## Hashing Vectorizer

In [18]:
%%time
vectorizer = HashingVectorizer(stop_words="english")#, n_features=2**20)
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)

CPU times: user 30.9 s, sys: 13 ms, total: 30.9 s
Wall time: 30.9 s


In [19]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)

Accuracy is 0.779475 AUC is 0.8602496972133279
              precision    recall  f1-score   support

           0       0.79      0.76      0.77     79812
           1       0.77      0.80      0.78     80188

    accuracy                           0.78    160000
   macro avg       0.78      0.78      0.78    160000
weighted avg       0.78      0.78      0.78    160000

CPU times: user 1min 34s, sys: 146 ms, total: 1min 35s
Wall time: 1min 28s


# Bigrams

In [20]:
%%time
vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2))
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)
my_map ={v: k for k, v in vectorizer.vocabulary_.items()}
print(f"number of features = {X_train.shape[1]}")

number of features = 3413468
CPU times: user 2min 42s, sys: 3.5 s, total: 2min 45s
Wall time: 2min 45s


In [21]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)
print_top_features(clf.coef_.copy()[0], my_map)



Accuracy is 0.79118125 AUC is 0.8674078095565033
              precision    recall  f1-score   support

           0       0.80      0.77      0.79     79812
           1       0.78      0.81      0.80     80188

    accuracy                           0.79    160000
   macro avg       0.79      0.79      0.79    160000
weighted avg       0.79      0.79      0.79    160000

Positive_features    coef  		 negative_features    coef 
----------------------------------------------------------------------
wish luck            3.8954 		 sad                  -3.6296
doesnt hurt          3.5806 		 sadd                 -3.6009
wont hurt            3.4848 		 inaperfectworld      -3.5761
sad sad              3.2333 		 sadli                -3.4834
wont disappoint      3.2270 		 pass away            -3.2902
aint bad             3.2125 		 sadden               -3.2625
isnt bad             3.1350 		 cries                -3.2094
noth wrong           3.0072 		 dontyouh             -3.1184
sorri delay     

In [22]:
%%time
vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=10000)
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)
my_map ={v: k for k, v in vectorizer.vocabulary_.items()}
print(f"number of features = {X_train.shape[1]}")

number of features = 10000
CPU times: user 2min 31s, sys: 1.71 s, total: 2min 33s
Wall time: 2min 33s


In [23]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)
print_top_features(clf.coef_.copy()[0], my_map)

Accuracy is 0.7813875 AUC is 0.8583408145621485
              precision    recall  f1-score   support

           0       0.80      0.75      0.77     79812
           1       0.77      0.81      0.79     80188

    accuracy                           0.78    160000
   macro avg       0.78      0.78      0.78    160000
weighted avg       0.78      0.78      0.78    160000

Positive_features    coef  		 negative_features    coef 
----------------------------------------------------------------------
wish luck            3.3646 		 inaperfectworld      -3.7768
sad sad              2.9326 		 sadd                 -3.4387
isnt bad             2.8755 		 sadden               -3.2612
dont miss            2.6319 		 cries                -3.1073
wasnt bad            2.6243 		 sad                  -3.0763
noth wrong           2.5804 		 sadli                -3.0618
dont sad             2.4157 		 pass away            -2.9815
dont worri           2.1909 		 lost pleas           -2.9784
whi thank        

In [24]:
%%time
vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2) , max_features=100000)
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)
my_map ={v: k for k, v in vectorizer.vocabulary_.items()}
print(f"number of features = {X_train.shape[1]}")

number of features = 100000
CPU times: user 2min 33s, sys: 1.42 s, total: 2min 34s
Wall time: 2min 34s


In [25]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)
print_top_features(clf.coef_.copy()[0], my_map)

Accuracy is 0.78826875 AUC is 0.864436685570346
              precision    recall  f1-score   support

           0       0.80      0.77      0.78     79812
           1       0.78      0.81      0.79     80188

    accuracy                           0.79    160000
   macro avg       0.79      0.79      0.79    160000
weighted avg       0.79      0.79      0.79    160000

Positive_features    coef  		 negative_features    coef 
----------------------------------------------------------------------
doesnt hurt          3.6514 		 inaperfectworld      -3.7542
wish luck            3.6408 		 sadd                 -3.3917
wont disappoint      3.3058 		 dontyouh             -3.3754
wont hurt            3.2642 		 sad                  -3.3653
sad sad              3.1557 		 pic reason           -3.3361
aint bad             3.0752 		 cries                -3.2535
wont miss            2.9695 		 pakcricket           -3.2516
didnt hurt           2.9296 		 sadden               -3.2473
didnt miss       

In [26]:
%%time
vectorizer = HashingVectorizer(stop_words="english",  ngram_range=(1, 2))#, n_features=2**20)
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)

CPU times: user 45.8 s, sys: 452 ms, total: 46.3 s
Wall time: 46.3 s


In [27]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)

Accuracy is 0.7914875 AUC is 0.8736302615762446
              precision    recall  f1-score   support

           0       0.80      0.77      0.79     79812
           1       0.78      0.81      0.80     80188

    accuracy                           0.79    160000
   macro avg       0.79      0.79      0.79    160000
weighted avg       0.79      0.79      0.79    160000

CPU times: user 2min 18s, sys: 113 ms, total: 2min 18s
Wall time: 2min 12s


In [28]:
%%time
vectorizer = HashingVectorizer(stop_words="english",  ngram_range=(1, 4))#, n_features=2**20)
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)

CPU times: user 1min 5s, sys: 1.81 s, total: 1min 7s
Wall time: 1min 7s


In [29]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)

Accuracy is 0.78754375 AUC is 0.8701233515843341
              precision    recall  f1-score   support

           0       0.80      0.77      0.78     79812
           1       0.78      0.81      0.79     80188

    accuracy                           0.79    160000
   macro avg       0.79      0.79      0.79    160000
weighted avg       0.79      0.79      0.79    160000

CPU times: user 3min 18s, sys: 304 ms, total: 3min 19s
Wall time: 3min 13s


## TF-IDF

In [30]:
%%time
vectorizer = TfidfVectorizer(stop_words="english",
                             ngram_range=(1, 1),
                             max_df=0.5, 
                             min_df=5, 
                             max_features=None,
                             smooth_idf=True,
                             sublinear_tf=False)
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)
my_map ={v: k for k, v in vectorizer.vocabulary_.items()}

CPU times: user 1min 12s, sys: 58 ms, total: 1min 12s
Wall time: 1min 12s


In [31]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)
print_top_features(clf.coef_.copy()[0], my_map)

Accuracy is 0.77775625 AUC is 0.8581793905456843
              precision    recall  f1-score   support

           0       0.79      0.76      0.77     79812
           1       0.77      0.80      0.78     80188

    accuracy                           0.78    160000
   macro avg       0.78      0.78      0.78    160000
weighted avg       0.78      0.78      0.78    160000

Positive_features    coef  		 negative_features    coef 
----------------------------------------------------------------------
thank                5.0583 		 sad                  -10.6170
smile                4.5882 		 miss                 -6.9778
welcom               4.3479 		 sadli                -6.9738
proud                4.0340 		 poor                 -6.5698
congratul            3.9762 		 unfortun             -6.0787
glad                 3.9327 		 sick                 -5.7863
reliev               3.7688 		 depress              -5.6400
pleasur              3.5938 		 sadden               -5.5915
awesom         

In [32]:
%%time
vectorizer = TfidfVectorizer(stop_words="english",
                             ngram_range=(1, 1),
                             max_df=0.5, 
                             min_df=5, 
                             max_features=10000,
                             smooth_idf=True,
                             sublinear_tf=False)
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)
my_map ={v: k for k, v in vectorizer.vocabulary_.items()}

CPU times: user 1min 12s, sys: 24.1 ms, total: 1min 12s
Wall time: 1min 12s


In [33]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)
print_top_features(clf.coef_.copy()[0], my_map)

Accuracy is 0.7754 AUC is 0.8559772699469734
              precision    recall  f1-score   support

           0       0.79      0.75      0.77     79812
           1       0.77      0.80      0.78     80188

    accuracy                           0.78    160000
   macro avg       0.78      0.78      0.78    160000
weighted avg       0.78      0.78      0.78    160000

Positive_features    coef  		 negative_features    coef 
----------------------------------------------------------------------
thank                4.8635 		 sad                  -10.1334
smile                4.4192 		 sadli                -6.6699
welcom               4.2110 		 miss                 -6.5492
proud                3.8456 		 poor                 -6.1955
congratul            3.8310 		 unfortun             -5.8539
glad                 3.7437 		 sick                 -5.5586
reliev               3.7171 		 sadden               -5.4078
pleasur              3.5089 		 depress              -5.3979
awesom             

In [34]:
%%time
vectorizer = TfidfVectorizer(stop_words="english",
                             ngram_range=(1, 2),
                             max_df=0.5, 
                             min_df=5, 
                             max_features=None,
                             smooth_idf=True,
                             sublinear_tf=False)
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)
my_map ={v: k for k, v in vectorizer.vocabulary_.items()}

CPU times: user 2min 37s, sys: 1.31 s, total: 2min 39s
Wall time: 2min 39s


In [35]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)
print_top_features(clf.coef_.copy()[0], my_map)

Accuracy is 0.79410625 AUC is 0.8761462433613789
              precision    recall  f1-score   support

           0       0.81      0.77      0.79     79812
           1       0.78      0.81      0.80     80188

    accuracy                           0.79    160000
   macro avg       0.79      0.79      0.79    160000
weighted avg       0.79      0.79      0.79    160000

Positive_features    coef  		 negative_features    coef 
----------------------------------------------------------------------
wish luck            8.7819 		 sad                  -16.1624
thank                7.9070 		 miss                 -11.2604
dont worri           7.0560 		 sadli                -9.9359
smile                6.5034 		 poor                 -9.6310
isnt bad             6.2690 		 unfortun             -8.9120
welcom               6.0191 		 wish                 -8.8630
wasnt bad            5.9651 		 sick                 -8.6440
dont forget          5.8576 		 hurt                 -8.1435
doesnt hurt   

In [36]:
%%time
vectorizer = TfidfVectorizer(stop_words="english",
                             ngram_range=(1, 2),
                             max_df=0.5, 
                             min_df=5, 
                             max_features=10000,
                             smooth_idf=True,
                             sublinear_tf=False)
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)
my_map ={v: k for k, v in vectorizer.vocabulary_.items()}

CPU times: user 2min 31s, sys: 890 ms, total: 2min 32s
Wall time: 2min 32s


In [37]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)
print_top_features(clf.coef_.copy()[0], my_map)

Accuracy is 0.78054375 AUC is 0.8632507273021415
              precision    recall  f1-score   support

           0       0.79      0.76      0.77     79812
           1       0.77      0.80      0.79     80188

    accuracy                           0.78    160000
   macro avg       0.78      0.78      0.78    160000
weighted avg       0.78      0.78      0.78    160000

Positive_features    coef  		 negative_features    coef 
----------------------------------------------------------------------
wish luck            6.6818 		 sad                  -11.3654
thank                5.5114 		 sadli                -7.6178
dont worri           5.3711 		 miss                 -7.4964
isnt bad             5.0034 		 poor                 -6.8534
wasnt bad            4.8235 		 unfortun             -6.7797
smile                4.7964 		 sick                 -5.9647
dont forget          4.5864 		 wish                 -5.9476
welcom               4.3645 		 pass away            -5.8808
noth wrong     

In [38]:
%%time
from scipy.sparse import hstack
vectorizer = TfidfVectorizer(stop_words="english",
                             ngram_range=(1, 2),
                             max_df=0.5, 
                             min_df=5, 
                             max_features=10000,
                             smooth_idf=True,
                             sublinear_tf=False)
vectorizer.fit(X__train)
X_train = vectorizer.transform(X__train)
X_test = vectorizer.transform(X__test)

vectorizer2 = CountVectorizer(stop_words="english",
                              ngram_range=(1, 2) ,
                              max_features=10000)
vectorizer2.fit(X__train)
X_train2 = vectorizer.transform(X__train)
X_test2 = vectorizer.transform(X__test)
X_train = hstack((X_train, X_train2))
X_test = hstack((X_test, X_test2))
print(f"number of features = {X_train.shape[1]}")

number of features = 20000
CPU times: user 5min 4s, sys: 2.51 s, total: 5min 6s
Wall time: 5min 6s


In [39]:
%%time
clf = train_model(X_train, X_test, y_train, y_test)

Accuracy is 0.78045625 AUC is 0.86319955092577
              precision    recall  f1-score   support

           0       0.79      0.76      0.77     79812
           1       0.77      0.81      0.79     80188

    accuracy                           0.78    160000
   macro avg       0.78      0.78      0.78    160000
weighted avg       0.78      0.78      0.78    160000

CPU times: user 1min 22s, sys: 274 ms, total: 1min 22s
Wall time: 1min 19s
