In [1]:
import pandas as pd
import numpy as np

In [2]:
#Reading datasets into panda dataframes(~sql table of sorts)
train_df = pd.read_csv('train.tsv',sep='\t')
test_df = pd.read_csv('unlabeled_test_with_noise.tsv',sep='\t')

In [3]:
train_df.head()

Unnamed: 0,Id,Text,Label
0,1241490299215634434,Official death toll from #covid19 in the Unite...,INFORMATIVE
1,1245916400981381130,"Dearest Mr. President @USER 1,169 coronavirus ...",INFORMATIVE
2,1241132432402849793,Latest Updates March 20 ⚠️5274 new cases and 3...,INFORMATIVE
3,1236107253666607104,真把公主不当干部 BREAKING: 21 people on Grand Princess...,INFORMATIVE
4,1239673817552879619,OKLAHOMA CITY — The State Department of Educat...,UNINFORMATIVE


In [4]:
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anush\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
#downloading list of stopwords
nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
stop_words = set(stopwords.words('english'))
import regex as re
import gensim 
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem.porter import PorterStemmer 
from nltk.stem import WordNetLemmatizer

#Function to preprocess data with Gensim 
def preprocess_gensim(text):
    #removing alphanumeric chars from data 
    text = [re.sub(r'[^a-zA-Z0-9]',' ',text) for text in text]
    
    #lemmatize, stem and tokenize words in the dataset, removing stopwords 
    text = [(PorterStemmer().stem(WordNetLemmatizer().lemmatize(w,pos='v')) ) for w in text]
    result = [[token for token in gensim.utils.simple_preprocess(sentence) if not token in
              gensim.parsing.preprocessing.STOPWORDS and len(token)>3] for sentence in text]
    return result

In [7]:
from sklearn.model_selection import train_test_split

#Split data into train and test data
X_train, X_test, y_train, y_test = train_test_split(train_df['Text'].to_list(), train_df['Label'].to_list(),
                                                    random_state=0)
# Carry out preprocessing on text data
words_train, words_test = preprocess_gensim(X_train), preprocess_gensim(X_test)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Extract Bag-of-Words (BoW)
vectorizer = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x)
features_train = vectorizer.fit_transform(words_train).toarray()

features_test = vectorizer.transform(words_test).toarray()

# Create a vocabulary from the dataset
vocabulary = vectorizer.vocabulary_

In [9]:
import sklearn.preprocessing as pr

# Normalize BoW features in training and test set
features_train = pr.normalize(features_train, axis=0)
features_test = pr.normalize(features_test, axis=0)

In [10]:
from sklearn.naive_bayes import GaussianNB

# Train a Guassian Naive Bayes classifier
nb = GaussianNB()

nb.fit(features_train, y_train)

# Calculate the mean accuracy score on training and test sets
print("[{}] Accuracy: train = {}, test = {}".format(
        nb.__class__.__name__,
        nb.score(features_train, y_train),
        nb.score(features_test, y_test)))


[GaussianNB] Accuracy: train = 0.9567474048442907, test = 0.6735870818915801


In [11]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=1000, random_state=0).fit(features_train, y_train)

print("[{}] Accuracy: train = {}, test = {}".format(
        logreg.__class__.__name__,
        logreg.score(features_train, y_train),
        logreg.score(features_test, y_test)))

[LogisticRegression] Accuracy: train = 0.9819300269127259, test = 0.7993079584775087


In [16]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=5, learning_rate=1.0, max_depth=1, random_state=0)
gbc.fit(features_train, y_train)

print('Accuracy of the GBM on training set: {:.3f}'.format(gbc.score(features_train, y_train)))
print('Accuracy of the GBM on test set: {0:.3f}'.format(gbc.score(features_test, y_test)))

Accuracy of the GBM on training set: 0.722
Accuracy of the GBM on test set: 0.730


In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

params_NB = {'var_smoothing': np.logspace(0,-9, num=10)}
cv_method = RepeatedStratifiedKFold(n_splits=2, 
                                    n_repeats=3, 
                                    random_state=0)

gs_NB = GridSearchCV(estimator=nb, 
                     param_grid=params_NB, 
                     cv=cv_method,
                     verbose=1, 
                     scoring='accuracy')


gs_NB.fit(features_train, y_train)

print(gs_NB.best_params_)
print(gs_NB.best_score_)

Fitting 6 folds for each of 10 candidates, totalling 60 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  4.6min finished
{'var_smoothing': 0.0001}
0.6557093425605536


In [23]:
param_grid = {'C': [100, 10, 1.0, 0.1, 0.01]}
k = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=0)

grid = GridSearchCV(logreg, param_grid=param_grid, cv=k, n_jobs=4, verbose=1)
grid.fit(features_train, y_train)

print('Best C:', grid.best_params_)

print('Accuracy of the GridSearch on training set: {:.3f}'.format(grid.score(features_train, y_train)))
print('Accuracy of the GridSearch on test set: {0:.3f}'.format(grid.score(features_test, y_test)))

Fitting 6 folds for each of 5 candidates, totalling 30 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  1.1min finished
Best C: {'C': 1.0}
Accuracy of the GridSearch on training set: 0.982
Accuracy of the GridSearch on test set: 0.799


In [24]:
from sklearn.model_selection import cross_val_score   #Additional scklearn functions

cv_score1 = cross_val_score(gbc, features_train, y_train, cv=3, scoring='roc_auc')
cv_score2 = cross_val_score(nb, features_train, y_train, cv=3, scoring='roc_auc')
cv_score3 = cross_val_score(logreg, features_train, y_train, cv=3, scoring='roc_auc')
cv_score4 = cross_val_score(grid, features_train, y_train, cv=3, scoring='roc_auc')

Fitting 6 folds for each of 5 candidates, totalling 30 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   37.4s finished
Fitting 6 folds for each of 5 candidates, totalling 30 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   34.1s finished
Fitting 6 folds for each of 5 candidates, totalling 30 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   43.2s finished


In [25]:
print(cv_score1)
print(cv_score2)
print(cv_score3)
print(cv_score4)

[0.75300204 0.75195611 0.76729618]
[0.67346809 0.68211702 0.63101659]
[0.87105459 0.87818198 0.8591318 ]
[0.87105459 0.87818198 0.8591318 ]


In [26]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

y_pred1 = nb.predict(features_test)
y_pred2 = logreg.predict(features_test)
y_pred3 = grid.predict(features_test)


print("---Test Set Results---")
print("Accuracy with Gaussian: {}".format(accuracy_score(y_test, y_pred1)))
print("Accuracy with Logistic regression: {}".format(accuracy_score(y_test, y_pred2)))
print("Accuracy with tuned Logistic regression: {}".format(accuracy_score(y_test, y_pred3)))


---Test Set Results---
Accuracy with Gaussian: 0.6735870818915801
Accuracy with Logistic regression: 0.7993079584775087
Accuracy with tuned Logistic regression: 0.7993079584775087


In [1]:
pip install tensorflow --user

Note: you may need to restart the kernel to use updated packages.


In [1]:
import tensorflow as tf
from tensorflow import keras 

ImportError: Traceback (most recent call last):
  File "C:\Users\anush\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\pywrap_tensorflow.py", line 64, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: The specified module could not be found.


Failed to load the native TensorFlow runtime.

See https://www.tensorflow.org/install/errors

for some common reasons and solutions.  Include the entire stack trace
above this error message when asking for help.

In [12]:
#Switching to RNNs

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(words_train)

X_train = tokenizer.texts_to_sequences(words_train)
X_test = tokenizer.texts_to_sequences(words_test)

vocabulary_size = len(tokenizer.word_index) + 1 
#Adding 1 because of the reserved 0 index

print(words_train[2]) 

ModuleNotFoundError: No module named 'keras'