### Library and data import

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [4]:
# Load the labelled tweets; the preview below shows the `id`, `label` and `tweet` columns.
DATA_PATH = 'dataset/twitter_sentiments.csv'
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
# Hold out 20% of the rows for evaluation. Stratifying on `label` keeps the
# class ratio the same in both splits; the fixed seed makes the split repeatable.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data['label'],
    random_state=21,
)
train.shape, test.shape

((25569, 3), (6393, 3))

### Transforming the data

In [6]:
# Learn a 1000-term TF-IDF vocabulary from the training tweets only, so nothing
# from the held-out split leaks into the features.
# `stop_words='english'` selects scikit-learn's built-in English list (the same
# words as ENGLISH_STOP_WORDS). Passing the frozenset itself is rejected by the
# parameter validation in recent scikit-learn releases, which accept only
# 'english', a list, or None.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, max_features=1000, stop_words='english')
tfidf_vectorizer.fit(train.tweet)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=1000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents=None, sublinear_tf=False,
                token_pa

In [7]:
# Peek at the first five raw training tweets.
train['tweet'].head(5)

11221     i am trusting. #i_am #positive #affirmation     
8977     trump vs clinton...smdh...america really is a ...
26800    my ð&amp;ð go out 2the victims &amp;fami...
28522    #rainbowrowell   bull up: you will dominate yo...
5811      @user @user just ordered two spf15 with bronz...
Name: tweet, dtype: object

In [8]:
# Peek at the first five raw held-out tweets.
test['tweet'].head(5)

11420      13 days!!! #love #life   #reunited   #countdown
12226    @user ðpathetic, selfish &amp; disrespectfu...
22901    thomas always says i live in a dream world, no...
16449    @user #allahsoil one infamous maneuver has aff...
2769                         #fashion it is a true   #fact
Name: tweet, dtype: object

In [10]:
# Project both splits onto the vocabulary fitted on the training tweets above.
train_idf, test_idf = (
    tfidf_vectorizer.transform(split.tweet) for split in (train, test)
)

In [12]:
# First five rows of the sparse TF-IDF matrix (one row per tweet, one column per vocabulary term).
train_idf[:5, :]


<5x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [14]:
# Print the non-zero entries of the first five held-out tweets.
# Each line reads `(row, column)  value`: `column` is the index of a term in the
# fitted vocabulary and `value` is that term's TF-IDF weight in the tweet.
# The row is always 0 because every iteration yields a single-row matrix.
for row in test_idf[:5]:
    print(row)

  (0, 517)	0.2669683265136178
  (0, 486)	0.3346351233884891
  (0, 193)	0.39814015309536527
  (0, 172)	0.5726445526063562
  (0, 6)	0.5747259187619383
  (0, 905)	0.14683231106480715
  (0, 864)	0.31338905906365044
  (0, 852)	0.3844551818611087
  (0, 626)	0.5112891418427852
  (0, 459)	0.4359944648781471
  (0, 170)	0.44639030369750854
  (0, 47)	0.2857441513326945
  (0, 969)	0.24282020715777927
  (0, 729)	0.3039931644977952
  (0, 697)	0.2849774010518948
  (0, 498)	0.2525040490604587
  (0, 490)	0.20319983684558657
  (0, 486)	0.40210715005128433
  (0, 457)	0.19144252316264201
  (0, 227)	0.6260762866933708
  (0, 92)	0.26842875922891196
  (0, 905)	0.286646339841049
  (0, 41)	0.9580364689591571
  (0, 879)	0.5518808833510299
  (0, 275)	0.559699876165511
  (0, 264)	0.6181937715732815


### Applying logistic regression

In [17]:
# Baseline: logistic regression with default settings on the TF-IDF features.
model_LR = LogisticRegression().fit(train_idf, train.label)

predict_train, predict_test = model_LR.predict(train_idf), model_LR.predict(test_idf)

# Report F1 on the training split (printed) and on the held-out split (cell output).
print(f1_score(train.label, predict_train))
f1_score(test.label, predict_test)


0.48821414302836597


0.45751633986928114

In [18]:
# Bundle vectorisation and classification into one estimator so raw tweet text
# can be passed straight to `fit` / `predict` (and saved as a single artefact).
# `stop_words='english'` selects the same built-in list as ENGLISH_STOP_WORDS;
# recent scikit-learn releases reject a frozenset for this parameter.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(lowercase=True, max_features=1000, stop_words='english')),
    ('model', LogisticRegression()),
])

pipeline.fit(train.tweet, train.label)



Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=1000,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterward...
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),


In [20]:
# Smoke-test the fitted pipeline on raw, unseen text.
# The input gets its own name instead of rebinding `test`: `test` is the held-out
# DataFrame from the train/test split, and overwriting it with a list would break
# the evaluation cells (`test.tweet`, `test.label`) if they were re-run.
sample_tweets = ["Virat Kohli, AB de Villiers set to auction their 'Green Day' kits from 2016 IPL match to raise funds"]
pipeline.predict(sample_tweets)

array([0], dtype=int64)

### Dumping the model using the joblib library

In [22]:
from joblib import dump

# Persist the whole fitted pipeline (vectoriser + classifier) to disk.
# `dump` returns the list of file names it wrote, shown as the cell output.
dump(pipeline, "text_classification.joblib")

['text_classification.joblib']

### Using the saved model

In [26]:
from joblib import load

# Restore the pipeline saved above and classify a new piece of raw text.
# NOTE: joblib files are pickle-based, so only load artefacts from a trusted source.
pipeline = load("text_classification.joblib")

text = ["Love in time of cholera was a racist book."]
pipeline.predict(text)

array([0], dtype=int64)