In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv
/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv


# Corona Tweets NLP

## Introduction

The objective of this project is to perform Text Classification on the data. The tweets have been pulled from Twitter and manual tagging has been done then. The objective is to train a model to do sentiment classification of the sentences.

## Preprocessing

In [2]:
data = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv', encoding='ISO-8859-1')

In [3]:
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


I will drop some columns because I think have no meaning to the prediction, the columns are:
 - UserName
 - ScreenName
 - Location
 - TweetAt

The columns I will use are:
 - OriginalTweet as documents
 - Sentiment as label

In [4]:
documents = data['OriginalTweet']
labels = data['Sentiment']

## Modeling

In [5]:
from spacy.lang.en import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline

Because this is a classification kind of problem, I will try to test which classification model would perform the best. Here are some models I consider:
- MultinomialNB
- KNeighborsClassifier
- LinearSVC

In [6]:
Xt = TfidfVectorizer(stop_words=STOP_WORDS).fit_transform(documents)

  'stop_words.' % sorted(inconsistent))


In [7]:
estimators = {
    'MultinomialNB': MultinomialNB(),
    'KNN': KNeighborsClassifier(),
    'SVC': LinearSVC()
}

for name, est in estimators.items():
    kfold = KFold(n_splits=10, shuffle=True, random_state=100)
    scores = cross_val_score(estimator=est, X=Xt, y=labels, cv=kfold, scoring='f1_macro')
    print('{}: {:.3f}'.format(name, scores.mean()))

MultinomialNB: 0.222
KNN: 0.346
SVC: 0.568


The observation shows that SVC performs the best. I will do the hyperparameter tuning in the next section.

### Fitting Model

In [8]:
from tempfile import mkdtemp

In [9]:
cachedir = mkdtemp()
est = Pipeline([
    ('vectorize', TfidfVectorizer(stop_words=STOP_WORDS)),
    ('classifier', GridSearchCV(LinearSVC(), param_grid={'C': np.linspace(1, 3, 10)},
                               n_jobs=-1, cv=5, verbose=1))
], memory=cachedir)
est.fit(documents, labels)

  'stop_words.' % sorted(inconsistent))


Fitting 5 folds for each of 10 candidates, totalling 50 fits


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.2min finished


Pipeline(memory='/tmp/tmpmq00sbxy',
         steps=[('vectorize',
                 TfidfVectorizer(stop_words={"'d", "'ll", "'m", "'re", "'s",
                                             "'ve", 'a', 'about', 'above',
                                             'across', 'after', 'afterwards',
                                             'again', 'against', 'all',
                                             'almost', 'alone', 'along',
                                             'already', 'also', 'although',
                                             'always', 'am', 'among', 'amongst',
                                             'amount', 'an', 'and', 'another',
                                             'any', ...})),
                ('classifier',
                 GridSearchCV(cv=5, estimator=LinearSVC(), n_jobs=-1,
                              param_grid={'C': array([1.        , 1.22222222, 1.44444444, 1.66666667, 1.88888889,
       2.11111111, 2.33333333, 2.55555556, 2.777

In [10]:
est.named_steps['classifier'].best_params_

{'C': 1.0}

In [11]:
est.score(documents, labels)

0.9671501810141653

## Evaluate the Model

In [12]:
test_data = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv', encoding='ISO-8859-1')

In [13]:
test_data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


The test data has a very similar structure as train data.

In [14]:
test_documents = test_data['OriginalTweet']
test_labels = test_data['Sentiment']

In [15]:
est.score(test_documents, test_labels)

0.5339652448657188