Import required libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


import sys
import re
!pip install emoji --quiet
import emoji
!pip install contractions --quiet
import contractions
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import unicodedata

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dataset

In [2]:
train= pd.read_csv("/content/drive/My Drive/COVID19Tweet-master/TRAIN_WNUT.csv")
valid=pd.read_csv("/content/drive/My Drive/COVID19Tweet-master/VALID_WNUT.csv")
total= pd.concat([train,valid],ignore_index=True)
mix = total.iloc[:,1:]
mix

Unnamed: 0,Id,Text,Label
0,1241490299215634434,Official death toll from #covid19 in the Unite...,INFORMATIVE
1,1245916400981381130,"Dearest Mr. President @USER 1,169 coronavirus ...",INFORMATIVE
2,1241132432402849793,Latest Updates March 20 ⚠️5274 new cases and 3...,INFORMATIVE
3,1236107253666607104,真把公主不当干部 BREAKING: 21 people on Grand Princess...,INFORMATIVE
4,1239673817552879619,OKLAHOMA CITY — The State Department of Educat...,UNINFORMATIVE
...,...,...,...
7995,1245955124222099456,Coronavirus took hold in UK earlier than thoug...,UNINFORMATIVE
7996,1241768801210904576,I talked with a man who is Rowan County’s seco...,INFORMATIVE
7997,1241172153040502795,Governor Wolf delaying enforcement of non-life...,UNINFORMATIVE
7998,1239740620194766848,The Sheriff's Department has reduced the jail ...,UNINFORMATIVE


## Data Cleaning

In [3]:
def cleaning(text):
  text= text.lower()
  text= emoji.demojize(text)
  text=contractions.fix(text)
  text=text.strip()
  text=text.replace('[^\w\s]','')
  text=re.sub(r'http\S+', '', text)
  REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
  BAD_SYMBOLS_RE = re.compile('[^0-9a-z +]')
  text = REPLACE_BY_SPACE_RE.sub(' ' , text)
  text = BAD_SYMBOLS_RE.sub(' ',text)
  
  return text

clean=mix['Text'].apply(cleaning)
STOPWORDS = set(stopwords.words('english'))

ff=[]
for i in clean:
  text=unicodedata.normalize('NFKD', i).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  ff.append(text)
dd=pd.DataFrame(ff)
dataset = pd.concat([dd,mix['Label']],axis=1)
dataset

Unnamed: 0,0,Label
0,official death toll from covid19 in the unite...,INFORMATIVE
1,dearest mr president user 1 169 coronavirus ...,INFORMATIVE
2,latest updates march 20 warning selector 5274...,INFORMATIVE
3,breaking 21 people on grand princess...,INFORMATIVE
4,oklahoma city the state department of educat...,UNINFORMATIVE
...,...,...
7995,coronavirus took hold in uk earlier than thoug...,UNINFORMATIVE
7996,i talked with a man who is rowan county s seco...,INFORMATIVE
7997,governor wolf delaying enforcement of non life...,UNINFORMATIVE
7998,the sheriff s department has reduced the jail ...,UNINFORMATIVE


## Models for Conventional Approaches

In [5]:
regressor = LogisticRegression(solver='liblinear') 
S = svm.SVC()
N = BernoulliNB()
Rf = RandomForestClassifier(max_depth=8, random_state=0)
ANN = MLPClassifier(solver='lbfgs', alpha=1e-5,
                   hidden_layer_sizes=(5, 2), random_state=1)

## Count Vector

In [6]:
cv= CountVectorizer()
transform = cv.fit_transform(dataset[0].values.astype('U'))
x_train= transform[:len(train)]                                           #trainset 
x_valid= transform[len(train):]                                           #validset
print(x_train.shape)
print(x_valid.shape)

(7000, 22165)
(1000, 22165)


### Training the models

In [7]:
regressor.fit(x_train,dataset['Label'][:len(train)])
S.fit(x_train,dataset['Label'][:len(train)])
N.fit(x_train,dataset['Label'][:len(train)])
Rf.fit(x_train, dataset['Label'][:len(train)])
ANN.fit(x_train, dataset['Label'][:len(train)])

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

### Prediction of labels for Valid dataset using Count Vector

In [8]:
y_pred1= regressor.predict(x_valid)
y_pred2= S.predict(x_valid)
y_pred3= N.predict(x_valid)
y_pred4= Rf.predict(x_valid)
y_pred5= ANN.predict(x_valid)

In [9]:
for i in range(5):
  ac=accuracy_score(dataset['Label'][len(train):], globals()['y_pred'+str(int(i+1))])
  print(ac)
  print(classification_report(dataset['Label'][len(train):], globals()['y_pred'+str(int(i+1))]))

0.804
               precision    recall  f1-score   support

  INFORMATIVE       0.82      0.75      0.78       472
UNINFORMATIVE       0.79      0.85      0.82       528

     accuracy                           0.80      1000
    macro avg       0.81      0.80      0.80      1000
 weighted avg       0.81      0.80      0.80      1000

0.806
               precision    recall  f1-score   support

  INFORMATIVE       0.84      0.73      0.78       472
UNINFORMATIVE       0.78      0.87      0.83       528

     accuracy                           0.81      1000
    macro avg       0.81      0.80      0.80      1000
 weighted avg       0.81      0.81      0.80      1000

0.785
               precision    recall  f1-score   support

  INFORMATIVE       0.77      0.78      0.77       472
UNINFORMATIVE       0.80      0.79      0.79       528

     accuracy                           0.79      1000
    macro avg       0.78      0.78      0.78      1000
 weighted avg       0.79      0.79     

## Tf-IDf

In [10]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset[0])
X_train= X[:len(train)]
X_valid= X[len(train):]

In [11]:
regressor.fit(X_train,dataset['Label'][:len(train)])
S.fit(X_train,dataset['Label'][:len(train)])
N.fit(X_train,dataset['Label'][:len(train)])
Rf.fit(X_train, dataset['Label'][:len(train)])
ANN.fit(X_train, dataset['Label'][:len(train)])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [12]:
Y_pred1= regressor.predict(X_valid)
Y_pred2= S.predict(X_valid)
Y_pred3= N.predict(X_valid)
Y_pred4= Rf.predict(X_valid)
Y_pred5= ANN.predict(X_valid)

In [13]:
for i in range(5):
  ac=accuracy_score(dataset['Label'][len(train):], globals()['Y_pred'+str(int(i+1))])
  print(ac)
  print(classification_report(dataset['Label'][len(train):], globals()['Y_pred'+str(int(i+1))]))

0.813
               precision    recall  f1-score   support

  INFORMATIVE       0.86      0.72      0.78       472
UNINFORMATIVE       0.78      0.90      0.84       528

     accuracy                           0.81      1000
    macro avg       0.82      0.81      0.81      1000
 weighted avg       0.82      0.81      0.81      1000

0.814
               precision    recall  f1-score   support

  INFORMATIVE       0.86      0.72      0.78       472
UNINFORMATIVE       0.78      0.90      0.84       528

     accuracy                           0.81      1000
    macro avg       0.82      0.81      0.81      1000
 weighted avg       0.82      0.81      0.81      1000

0.785
               precision    recall  f1-score   support

  INFORMATIVE       0.77      0.78      0.77       472
UNINFORMATIVE       0.80      0.79      0.79       528

     accuracy                           0.79      1000
    macro avg       0.78      0.78      0.78      1000
 weighted avg       0.79      0.79     