### This notebook reuses the preprocessing pipeline from ex01_V2, so the preprocessing steps are not commented on again here

In [13]:
import csv
import re

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import zero_one_loss
from sklearn.linear_model import LogisticRegression

from sklearn.neural_network import MLPClassifier

In [2]:
# Published Google Sheets exports (the `output=tsv` query parameter requests tab-separated text).
# One sheet holds the combined train/dev data, the other the test data; both are fetched by load_dataset.
url_train_dev = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTOZ2rC82rhNsJduoyKYTsVeH6ukd7Bpxvxn_afOibn3R-eadZGXu82eCU9IRpl4CK_gefEGsYrA_oM/pub?gid=1863430984&single=true&output=tsv'
url_test = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vT-KNR9nuYatLkSbzSRgpz6Ku1n4TN4w6kKmFLkA6QJHTfQzmX0puBsLF7PAAQJQAxUpgruDd_RRgK7/pub?gid=417546901&single=true&output=tsv'

In [3]:
from io import StringIO
import requests

def load_dataset(url, timeout=30):
    """Download a tab-separated file and return it as a two-column DataFrame.

    Parameters
    ----------
    url : str
        Address of the TSV resource to download.
    timeout : float or tuple, optional
        Passed to ``requests.get``; without a timeout a stalled connection
        would block the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        The parsed table with its columns renamed to ``tweet`` and ``label``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        Raised by pandas if the table does not have exactly two columns.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as data.
    r.raise_for_status()
    data = r.content.decode('utf8')
    df = pd.read_csv(StringIO(data), sep='\t')
    df.columns = ['tweet', 'label']
    return df

In [4]:
# Download both splits; each call performs a network request and returns a
# DataFrame with the columns 'tweet' and 'label'.
df_train_dev = load_dataset(url_train_dev)
df_test = load_dataset(url_test)

In [5]:
def convert_lowercase(txt):
    """Return ``txt`` with all cased characters converted to lowercase."""
    return txt.lower()

# Raw string so that \S and \. reach the regex engine unchanged instead of being
# parsed as (invalid) Python string escapes, which triggers a SyntaxWarning on
# recent Python versions.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def remove_url(text):
    """Return ``text`` with every URL-like substring deleted.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character.

    Parameters
    ----------
    text : str
        Input string. Non-string input raises ``TypeError`` (no coercion with
        ``str()`` is performed).

    Returns
    -------
    str
        The input with all matches replaced by the empty string.
    """
    return _URL_PATTERN.sub('', text)

import string
exclude = string.punctuation
def remove_punc(text):
    """Return ``text`` with every character contained in ``exclude`` deleted."""
    # Translation table that maps nothing and deletes the characters in `exclude`.
    deletion_table = str.maketrans('', '', exclude)
    return text.translate(deletion_table)

import emoji
def demojize_emojis(text):
    """Return the result of ``emoji.demojize`` applied to ``text``."""
    demojized = emoji.demojize(text)
    return demojized

def remove_emoji(string):
    """Delete every run of characters that falls into the listed code-point ranges.

    NOTE(review): the final range runs from U+24C2 to U+1F251, which is far
    wider than emoji alone -- it also covers, for example, CJK ideographs, so
    such text is removed as well. Confirm this is acceptable for the data.

    The parameter name shadows the ``string`` module inside this function only.
    """
    code_point_ranges = "".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    ))
    matcher = re.compile("[" + code_point_ranges + "]+", flags=re.UNICODE)
    return matcher.sub(r'', string)
    
def remove_special(text):
    """Return ``text`` with each non-alphanumeric character replaced by one space."""
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

def none_to_empty_string(text):
    """Return ``text`` unchanged if it is truthy, otherwise the empty string.

    Any falsy value (``None``, ``""``, ``0`` ...) is mapped to ``""``.
    The original author was unsure whether this step is needed at all.
    """
    return text if text else ""

In [6]:
# Clean the train/dev tweets: each step is applied element-wise, in this order.
X_train_dev = (
    df_train_dev['tweet']
    .apply(remove_emoji)
    .apply(convert_lowercase)
    .apply(remove_url)
    .apply(remove_punc)
    .apply(none_to_empty_string)
)


In [7]:
def custom_encoder(v, d):
    """Replace each label in ``v`` by an integer code, in place.

    Codes are handed out in order of first appearance. When an existing mapping
    is supplied, labels it already knows keep their code and unseen labels get
    fresh codes that continue after the largest code in the mapping, so a new
    label can never collide with an existing one.

    Parameters
    ----------
    v : mutable sequence indexable by ``0 .. len(v) - 1``
        Labels to encode. The elements are overwritten with their codes.
    d : dict or None
        Existing label -> code mapping to reuse and extend (modified in place),
        or ``None`` to start from an empty mapping.

    Returns
    -------
    tuple
        ``(v, d)``: the same sequence object, now holding codes, and the mapping.
    """
    if d is None:
        d = {}
    # Continue numbering after the codes already assigned; restarting at 0
    # would give an unseen label the same code as an existing one.
    cur_value = max(d.values(), default=-1) + 1
    for i in range(len(v)):
        label = v[i]
        if label not in d:
            d[label] = cur_value
            cur_value += 1
        v[i] = d[label]
    return v, d

In [8]:
# Encode a copy of the label column: custom_encoder overwrites its input element
# by element, so passing the column itself would replace the original labels in
# df_train_dev and make this cell produce a different mapping when re-run.
y_train_dev, current_dictionary = custom_encoder(df_train_dev['label'].copy(), None)
# The encoded Series still has its original dtype; cast to a proper integer dtype.
y_train_dev = y_train_dev.astype('int')

In [9]:
# Hold out 10% of the train/dev data for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_dev,
    y_train_dev,
    test_size=0.1,
    random_state=99,
)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit on the training split only, then convert both splits to count matrices
# with that fitted vectorizer.
vectorizer = CountVectorizer(stop_words=None).fit(X_train)
X_train, X_val = (vectorizer.transform(split) for split in (X_train, X_val))

In [11]:
# Clean the test tweets with the same steps, in the same order, as the train/dev data.
X_test = (
    df_test['tweet']
    .apply(remove_emoji)
    .apply(convert_lowercase)
    .apply(remove_url)
    .apply(remove_punc)
    .apply(none_to_empty_string)
)

In [12]:
# Vectorize the cleaned test tweets with the vectorizer fitted on the training split.
X_test = vectorizer.transform(X_test)
# Encode a copy so the label column of df_test is not overwritten in place, and
# cast to int so y_test has the same dtype as y_train_dev.
# Labels absent from current_dictionary are added to it by custom_encoder.
y_test = custom_encoder(df_test['label'].copy(), current_dictionary)[0].astype('int')

### Now we use an MLP as our model instead of a linear model

In [None]:
# Multi-layer perceptron with a fixed seed and at most 100 training iterations;
# all other hyperparameters are left at the library defaults.
mod = MLPClassifier(
    max_iter=100,
    random_state=99,
)
mod.fit(X_train, y_train)

In [None]:
# NOTE: despite the name, these are the model's predictions for the validation
# split (X_val), not for the training split.
y_train_pred = mod.predict(X_val)

In [None]:
# Validation accuracy in percent (accuracy = 1 - zero-one loss).
# The validation predictions are stored in `y_train_pred`; `y_pred` is never
# defined in this notebook and raised a NameError on a fresh kernel.
print((1 - zero_one_loss(y_val, y_train_pred))*100, '%')

Let's look at the performance on the test data.