In [None]:
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import torch

from matplotlib import pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from wordcloud import WordCloud
from xgboost import XGBClassifier


In [None]:
# Route every pandas `.plot` call in this notebook through matplotlib.
pd.set_option("plotting.backend", "matplotlib")

In [None]:
# Load the train and test splits from CSV files under data/ (paths are relative
# to the notebook's working directory).
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [None]:
# Peek at the last 20 rows of the training frame.
df_train.tail(20)

In [None]:
# Column dtypes and non-null counts, to spot missing values before cleaning.
df_train.info()

In [None]:
# Drop, in place, every training row that has a missing value in ANY column.
# NOTE(review): the models below are fed only `text` (via its lemmatised form)
# and `target`, so rows whose only gap is in some other column are discarded
# too — confirm that this loss of training data is intended.
df_train.dropna(inplace=True)

In [None]:
# Summary statistics of the training frame after incomplete rows were dropped.
df_train.describe()

In [None]:
# Class balance: bar chart of how many training rows carry each `target` value.
df_train['target'].value_counts().plot.bar()

In [None]:
# Fetch NLTK's stop-word corpus; it is read later via nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
# Load spaCy's small English pipeline. It is used below for lemmatisation and
# as one of the two stop-word sources.
snlp = spacy.load('en_core_web_sm')

In [None]:
def lemmatizer(text):
    """Return `text` with every token replaced by its lemma.

    The text is parsed with the module-level spaCy pipeline `snlp`; the
    lemmas are joined back together with single spaces.
    """
    return " ".join(token.lemma_ for token in snlp(text))

In [None]:
def lemmatizer_verbs(text):
    sent = []
    doc = nlp(text)
    for word in doc:
        if word.pos_ == "VERB":
            sent.append(word.lemma_)
        else:
            sent.append(word.text)
    return " ".join(sent)

In [None]:
def preprocess(df):
    df['text_lemma'] = df.text.apply(lemmatizer)
    df['text_lemma_verbs'] = df.text.apply(lemmatizer)
    return df

In [None]:
# Combined stop-word list: spaCy's English defaults plus NLTK's English list,
# de-duplicated through a set union.
stopwords = list(
    set(snlp.Defaults.stop_words) | set(nltk.corpus.stopwords.words('english'))
)

In [None]:
# Add the lemmatised text columns (`text_lemma`, `text_lemma_verbs`) to the
# training frame. This runs spaCy over every row.
df_train = preprocess(df_train)

In [None]:
# Word clouds of the lemmatised training text, one per class.
# Texts are joined with a space: joining with "" glued the last word of each
# text to the first word of the next and distorted the word frequencies.
negative_cases = " ".join(df_train[df_train['target'] == 0]['text_lemma_verbs'].values)
positive_cases = " ".join(df_train[df_train['target'] == 1]['text_lemma_verbs'].values)

fig, axs = plt.subplots(1, 2, figsize=(15, 10))

wc1 = WordCloud(background_color='white').generate(negative_cases)
wc2 = WordCloud(background_color='white').generate(positive_cases)

axs[0].imshow(wc1, interpolation='bilinear')
axs[0].set_title('Negative')
axs[1].imshow(wc2, interpolation='bilinear')
axs[1].set_title('Positive')

plt.show()

In [None]:
# Model inputs: the `text_lemma_verbs` column as features and the `target`
# column, as a NumPy array, as labels.
y = df_train['target'].values
X = df_train['text_lemma_verbs']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Show the resulting shapes as the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Count Vectorizer & Random Forest

In [None]:
# Bag-of-words counts (with the combined stop-word list removed) feeding a
# seeded random forest.
rf_pipeline = Pipeline(steps=[
    ('CountVectorizer', CountVectorizer(stop_words=stopwords)),
    ('rf', RandomForestClassifier(random_state=42)),
])

In [None]:
# Fit the vectoriser and the forest on the training split only.
rf_pipeline.fit(X_train, y_train) 

In [None]:
# Predict labels for the held-out split.
y_pred = rf_pipeline.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

In [None]:
# Overall accuracy on the held-out split.
print(accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix of the Count Vectorizer + Random Forest predictions, drawn
# as an annotated heatmap (y axis: actual class, x axis: predicted class).
cnf_matrix = confusion_matrix(y_test, y_pred)
cmap = sns.color_palette("Blues", as_cmap=True)
class_labels = ['Negativo', 'Positivo']

ax = sns.heatmap(
    cnf_matrix,
    annot=True,
    fmt='g',
    cmap=cmap,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set_title('Matriz de Confusão')
ax.set_xlabel('Valor Previsto')
ax.set_ylabel('Valor Real')
plt.show()

# TF-IDF & Random Forest

In [None]:
# TF-IDF features (combined stop-word list removed) feeding a seeded random
# forest; fit on the training split, then score the held-out split.
rf_pipeline2 = Pipeline(steps=[
    ('TFIDF', TfidfVectorizer(stop_words=stopwords)),
    ('rf', RandomForestClassifier(random_state=42)),
])
y_pred2 = rf_pipeline2.fit(X_train, y_train).predict(X_test)

# Per-class metrics followed by overall accuracy.
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))

# Annotated confusion-matrix heatmap (y axis: actual, x axis: predicted).
cnf_matrix = confusion_matrix(y_test, y_pred2)
cmap = sns.color_palette("Blues", as_cmap=True)
class_labels = ['Negativo', 'Positivo']
ax = sns.heatmap(
    cnf_matrix,
    annot=True,
    fmt='g',
    cmap=cmap,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set_title('Matriz de Confusão')
ax.set_xlabel('Valor Previsto')
ax.set_ylabel('Valor Real')
plt.show()

# TF-IDF & Linear SVC

In [None]:
# TF-IDF features feeding a seeded linear SVM; fit on the training split, then
# score the held-out split.
# NOTE(review): the classifier step is labelled 'rf' although it is a
# LinearSVC; the label is left untouched so any lookup by step name keeps
# working.
svc = Pipeline(steps=[
    ('TFIDF', TfidfVectorizer(stop_words=stopwords)),
    ('rf', LinearSVC(random_state=42)),
])
y_pred3 = svc.fit(X_train, y_train).predict(X_test)

# Per-class metrics followed by overall accuracy.
print(classification_report(y_test, y_pred3))
print(accuracy_score(y_test, y_pred3))

# Annotated confusion-matrix heatmap (y axis: actual, x axis: predicted).
cnf_matrix = confusion_matrix(y_test, y_pred3)
cmap = sns.color_palette("Blues", as_cmap=True)
class_labels = ['Negativo', 'Positivo']
ax = sns.heatmap(
    cnf_matrix,
    annot=True,
    fmt='g',
    cmap=cmap,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set_title('Matriz de Confusão')
ax.set_xlabel('Valor Previsto')
ax.set_ylabel('Valor Real')
plt.show()

# Count Vectorizer & XGBoost

In [None]:
# Bag-of-words counts feeding a seeded XGBoost classifier; fit on the training
# split, then score the held-out split.
# NOTE(review): the vectoriser step is labelled 'TFIDF' but is a
# CountVectorizer, and the classifier step is spelled 'xbg'. Both labels are
# kept as-is because the grid-search cell addresses the steps by these names.
xgb = Pipeline(steps=[
    ('TFIDF', CountVectorizer(stop_words=stopwords)),
    ('xbg', XGBClassifier(random_state=42)),
])
y_pred4 = xgb.fit(X_train, y_train).predict(X_test)

# Per-class metrics followed by overall accuracy.
print(classification_report(y_test, y_pred4))
print(accuracy_score(y_test, y_pred4))

# Annotated confusion-matrix heatmap (y axis: actual, x axis: predicted).
cnf_matrix = confusion_matrix(y_test, y_pred4)
cmap = sns.color_palette("Blues", as_cmap=True)
class_labels = ['Negativo', 'Positivo']
ax = sns.heatmap(
    cnf_matrix,
    annot=True,
    fmt='g',
    cmap=cmap,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set_title('Matriz de Confusão')
ax.set_xlabel('Valor Previsto')
ax.set_ylabel('Valor Real')
plt.show()

In [None]:
# Exhaustive 5-fold grid search over the vectoriser's vocabulary size and the
# booster's tree depth / number of trees. Keys use sklearn's
# "<step name>__<parameter>" form and must match the step labels of `xgb`.
params_to_search = dict(
    TFIDF__max_features=[500, 1000, 2000],
    xbg__max_depth=[3, 5, 7],
    xbg__n_estimators=[100, 500, 1000],
)

grid = GridSearchCV(estimator=xgb, param_grid=params_to_search, cv=5).fit(X_train, y_train)

# Best parameter combination, then its mean cross-validated score.
print(grid.best_params_, grid.best_score_, sep='\n')

# BERT

In [None]:
# Fine-tune and evaluate BERT.
# BertTokenizer and BertForSequenceClassification are not scikit-learn
# estimators (they expose no fit/transform/predict), so they cannot be used as
# steps of a sklearn Pipeline as the original cell attempted. A short explicit
# PyTorch loop is used instead.
BERT_NAME = 'bert-base-uncased'
BERT_MAX_LEN = 128      # tokens kept per text; longer inputs are truncated
BERT_BATCH_SIZE = 32
BERT_EPOCHS = 2
BERT_LR = 2e-5

torch.manual_seed(42)   # repeatable classifier-head init and batch shuffling
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained(BERT_NAME)
model = BertForSequenceClassification.from_pretrained(BERT_NAME, num_labels=2).to(device)


def bert_encode(texts):
    """Tokenise a list of strings into padded, truncated tensors on `device`."""
    encoded = tokenizer(
        [str(text) for text in texts],
        padding=True,
        truncation=True,
        max_length=BERT_MAX_LEN,
        return_tensors='pt',
    )
    return encoded.to(device)


# --- fine-tuning on the training split ---
train_texts = list(X_train)
train_labels = torch.tensor(y_train, dtype=torch.long)
optimizer = torch.optim.AdamW(model.parameters(), lr=BERT_LR)

model.train()
for epoch in range(BERT_EPOCHS):
    # Fresh random order every epoch.
    order = torch.randperm(len(train_texts)).tolist()
    for start in tqdm(range(0, len(order), BERT_BATCH_SIZE), desc=f'epoch {epoch + 1}'):
        batch_idx = order[start:start + BERT_BATCH_SIZE]
        batch = bert_encode([train_texts[i] for i in batch_idx])
        # Passing `labels` makes the model return the classification loss.
        loss = model(**batch, labels=train_labels[batch_idx].to(device)).loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# --- prediction on the held-out split ---
model.eval()
test_texts = list(X_test)
pred_chunks = []
with torch.no_grad():
    for start in range(0, len(test_texts), BERT_BATCH_SIZE):
        batch = bert_encode(test_texts[start:start + BERT_BATCH_SIZE])
        pred_chunks.append(model(**batch).logits.argmax(dim=-1).cpu())
y_pred5 = torch.cat(pred_chunks).numpy()

print(classification_report(y_test, y_pred5))
print(accuracy_score(y_test, y_pred5))


# Submit dataset

In [None]:
# Build the submission frame: one predicted `target` per test `id`.
# The original cell used `vec_model` and `random_forest`, neither of which is
# defined anywhere in this notebook, so it failed on a fresh kernel. A pipeline
# fitted above is used instead; it bundles its own vectoriser and therefore
# takes text directly.
# NOTE(review): `rf_pipeline` is assumed to be the intended model — swap in
# rf_pipeline2 / svc / xgb / grid.best_estimator_ if another one was meant.
submission_model = rf_pipeline

# Give the test text the same preprocessing the training text received.
# `assign` works on a copy, so df_test keeps its original columns; missing
# texts become empty strings so the lemmatiser does not fail on NaN.
test_features = preprocess(
    df_test.assign(text=df_test['text'].fillna(''))
)['text_lemma_verbs']

df_new = pd.DataFrame({
    'id': df_test['id'],
    'target': submission_model.predict(test_features),
})

In [None]:
# Display the submission frame (pandas truncates long frames in the output).
df_new

In [None]:
# Bar chart of how many test rows were assigned to each predicted class.
df_new['target'].value_counts().plot.bar()

In [None]:
# (rows, columns) of the submission frame.
df_new.shape

In [None]:
# Attach the predicted `target` to the test rows. The join key is spelled out:
# without `on`, merge silently joins on every column the two frames happen to
# share, which changes meaning as soon as either frame gains a column.
df_test2 = df_test.merge(df_new, on='id')

In [None]:
# First rows of the test frame with predictions attached.
df_test2.head()

In [None]:
# Word clouds of the raw test text, split by predicted class.
# Texts are joined with a space: joining with "" glued the last word of each
# text to the first word of the next and distorted the word frequencies.
negative_cases = " ".join(df_test2[df_test2['target'] == 0]['text'].values)
positive_cases = " ".join(df_test2[df_test2['target'] == 1]['text'].values)

fig, axs = plt.subplots(1, 2, figsize=(15, 10))

wc1 = WordCloud(background_color='white').generate(negative_cases)
wc2 = WordCloud(background_color='white').generate(positive_cases)

axs[0].imshow(wc1, interpolation='bilinear')
axs[0].set_title('Negative')
axs[1].imshow(wc2, interpolation='bilinear')
axs[1].set_title('Positive')

plt.show()

In [None]:
# Write the submission file to the working directory, without the index column.
df_new.to_csv('submission.csv', index=False)