In [None]:
import numpy as np 
import pandas as pd 
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

import os



In [None]:
# Read the training tweets and the test tweets from their CSV files (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

In [None]:
# Text of the training tweets whose target is 1; the last line displays the second one.
disaster_tweets = train_df.loc[train_df['target'] == 1, 'text']
disaster_tweets.values[1]

'Forest fire near La Ronge Sask. Canada'

In [None]:
# Text of the training tweets whose target is 0 (i.e. NOT labelled as disasters).
# Kept under its own name so the data is not mislabelled and the earlier
# `disaster_tweets` selection is not overwritten. The last line displays the second one.
non_disaster_tweets = train_df.loc[train_df['target'] == 0, 'text']
non_disaster_tweets.values[1]

'I love fruits'

In [None]:
def clean_text(text):
    '''Normalise one raw text string before tokenisation.

    The steps run in this order: lowercase; drop text inside square brackets;
    drop http(s):// and www. links; drop HTML-like tags; drop ASCII
    punctuation; drop newline characters; drop every word containing a digit.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Matched spans are deleted, not replaced, so the
        whitespace that surrounded them is left in place (runs of spaces can
        remain, and words separated only by punctuation or a newline are joined).
    '''
    text = text.lower()
    # Raw string literals keep regex escapes (\[ \S \w \d) from being parsed as
    # invalid Python string escapes, which newer interpreters warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # re.escape makes every punctuation character literal inside the class.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Overwrite the 'text' column of both frames with its cleaned form.
# Note: this replaces the raw text in place; reload the CSVs to get it back.
train_df['text'] = train_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)

In [None]:
# Turn each cleaned tweet into a list of tokens matching the regex \w+.
# Note: the 'text' column holds lists after this cell, so it must not be re-run
# on already tokenized data.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
train_df['text'] = train_df['text'].apply(tokenizer.tokenize)
test_df['text'] = test_df['text'].apply(tokenizer.tokenize)

In [None]:
# Download the NLTK stop-word corpus that `stopwords.words('english')` reads.
# `nltk` is already imported in the notebook's import cell, so it is not re-imported here.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Cache of stop-word sets keyed by language, so the corpus is loaded only once.
_STOPWORD_CACHE = {}


def remove_stopwords(text, stop_words=None):
    '''Return the tokens of `text` that are not stop words, in their original order.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    stop_words : collection of str, optional
        Words to drop. When omitted, the NLTK English stop-word list is used;
        it is loaded once and cached as a frozenset.

    Returns
    -------
    list of str
        The tokens that are not in the stop-word collection.
    '''
    if stop_words is None:
        # Calling stopwords.words() for every token re-loads the list each time
        # and forces a linear scan; load it once and test membership in a set.
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    return [w for w in text if w not in stop_words]

# Filter the stop words out of every token list in both frames.
train_df['text'] = train_df['text'].apply(remove_stopwords)
test_df['text'] = test_df['text'].apply(remove_stopwords)

In [None]:
def combine_text(list_of_text):
    '''Join an iterable of string tokens into one string separated by single spaces.'''
    return ' '.join(list_of_text)

# Turn each token list back into a single space-separated string for the vectorizers.
train_df['text'] = train_df['text'].apply(combine_text)
test_df['text'] = test_df['text'].apply(combine_text)

In [None]:
def text_preprocessing(text):
    '''Run the whole text pipeline on one raw string.

    Cleans the text with `clean_text`, splits it into tokens matching the
    regex \\w+, drops NLTK English stop words and joins the remaining tokens
    with single spaces.

    Parameters
    ----------
    text : str
        Raw text to preprocess.

    Returns
    -------
    str
        The space-separated tokens that survive cleaning and stop-word removal.
    '''
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    # Load the stop words once per call, as a set, instead of re-loading the
    # list for every token and scanning it linearly.
    english_stopwords = set(stopwords.words('english'))

    cleaned = clean_text(text)
    tokens = word_tokenizer.tokenize(cleaned)
    # Named `kept_tokens` so the module-level `remove_stopwords` function is not shadowed.
    kept_tokens = [w for w in tokens if w not in english_stopwords]
    return ' '.join(kept_tokens)

In [None]:
# Bag-of-words features: the vocabulary is learned from the training text only,
# then the same fitted vectorizer is applied to the test text.
count_vectorizer = CountVectorizer()
train_df_vectors = count_vectorizer.fit_transform(train_df['text'])
test_df_vectors = count_vectorizer.transform(test_df["text"])

# Inspect the first training row. `.todense()` expands that single row to a full
# dense matrix (zeros included), so it is only called on one row here.
print(train_df_vectors[0].todense())

[[0 0 0 ... 0 0 0]]


In [None]:
# TF-IDF (Term Frequency-Inverse Document Frequency) features.
# Per the scikit-learn parameters: min_df=2 ignores terms found in fewer than 2 documents,
# max_df=0.5 ignores terms found in more than half of the documents, and
# ngram_range=(1, 2) extracts both unigrams and bigrams.

# Fitted on the training text only; the fitted vectorizer is then applied to the test text.
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
train_df_tfidf = tfidf.fit_transform(train_df['text'])
test_df_tfidf = tfidf.transform(test_df["text"])

In [None]:
# Logistic regression on the count features, scored with 5-fold cross-validated F1.
clf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_df_vectors,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
scores


array([0.59865255, 0.49611063, 0.57166948, 0.56290774, 0.68789809])

In [None]:
# Fit the count-based logistic regression on the full training set;
# this fitted `clf` is the model later used to predict the test set.
clf.fit(X=train_df_vectors, y=train_df["target"])


LogisticRegression()

In [None]:
# Logistic regression on the TF-IDF features, scored with 5-fold cross-validated F1.
clf_tfidf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(
    estimator=clf_tfidf,
    X=train_df_tfidf,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
scores

array([0.57229525, 0.49673203, 0.54277829, 0.46618106, 0.64768683])

In [None]:
# Multinomial Naive Bayes on the count features, scored with 5-fold cross-validated F1.
clf_NB = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf_NB,
    X=train_df_vectors,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
scores

array([0.63149079, 0.60675773, 0.68575519, 0.64341085, 0.72505092])

In [None]:
# Multinomial Naive Bayes on the TF-IDF features, scored with 5-fold cross-validated F1.
clf_NB_TFIDF = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf_NB_TFIDF,
    X=train_df_tfidf,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
scores

array([0.57590597, 0.57092511, 0.61135371, 0.5962963 , 0.7393745 ])

In [None]:
# XGBoost classifier on the count features, scored with 5-fold cross-validated F1.
# `xgb` is already imported in the notebook's import cell, so it is not re-imported here.
# `n_jobs` is the scikit-learn-style thread-count parameter and replaces the
# deprecated `nthread` alias (same value, 10 threads).
clf_xgb = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                            subsample=0.8, n_jobs=10, learning_rate=0.1)
scores = model_selection.cross_val_score(clf_xgb, train_df_vectors, train_df["target"], cv=5, scoring="f1")
scores

array([0.47379913, 0.37379576, 0.43988816, 0.38900634, 0.53142857])

In [None]:
# XGBoost classifier on the TF-IDF features, scored with 5-fold cross-validated F1.
# `xgb` is already imported in the notebook's import cell, so it is not re-imported here.
# `n_jobs` is the scikit-learn-style thread-count parameter and replaces the
# deprecated `nthread` alias (same value, 10 threads).
clf_xgb_TFIDF = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                                  subsample=0.8, n_jobs=10, learning_rate=0.1)
scores = model_selection.cross_val_score(clf_xgb_TFIDF, train_df_tfidf, train_df["target"], cv=5, scoring="f1")
scores

array([0.48947951, 0.34406439, 0.43140965, 0.40084388, 0.53014354])

In [None]:
# Predict a label for every test tweet with `clf`, the logistic regression
# fitted on the count features.
y_pred = clf.predict(test_df_vectors)
# Flatten to a 1-D integer array. reshape(-1) lets NumPy infer the length, so the
# cell no longer breaks when the test set does not have exactly 3263 rows.
y_pred = np.round(y_pred).astype(int).reshape(-1)

# Pair each test id with its predicted target; `.values.tolist()` drops the
# pandas index so the columns are matched purely by position.
sub = pd.DataFrame({'id': test_df['id'].values.tolist(),'target' : y_pred})

# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv',index = False)