# Real or Not? NLP with Disaster Tweets
*Gruppo:
Lorrai, Rossi*

In [1]:
import numpy as np
import pandas as pd
import string
import re

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from statistics import mean

import nltk
nltk.download('wordnet')

!pip install pyspellchecker

from nltk.stem import WordNetLemmatizer 
from spellchecker import SpellChecker

# Shared spell-checker and lemmatizer instances; adjust_spelling and
# lemmatize_words further down read these module-level names.
spell = SpellChecker()
lemmatizer = WordNetLemmatizer()


import os
# Print the path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
print("\nImport Completati")

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Collecting pyspellchecker
  Downloading pyspellchecker-0.5.4-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 2.8 MB/s 
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.5.4
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/train.csv

Import Completati


In [2]:
# Strip any markup tags -> Input "<title>titolo</title>" : Output "titolo"
def cleanhtml(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so only the tags themselves are dropped and the
    content between an opening and a closing tag is kept.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

# Drop lines that start with a link, they carry no signal -> Input "https://link_al_sito.com" : Output ""
def removeurl(text):
    """Remove every line of ``text`` that begins with an http:// or https:// URL.

    The pattern is anchored at the start of a line (``re.MULTILINE``) and
    consumes the rest of that line together with the trailing line breaks.
    URLs that appear in the middle of a line are left untouched.

    The previous pattern ``http?`` made the *p* optional instead of the *s*,
    so ``https://`` links were never matched.
    """
    return re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)

def remove_emoji(text):
    """Delete every run of characters that falls inside the code-point ranges below."""
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        # NOTE(review): this range runs from U+24C2 to U+1F251, so it also
        # covers many non-emoji characters (e.g. CJK ideographs).
        u"\U000024C2-\U0001F251",
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)
           
# English stopwords plus a few extra tokens (URL schemes and contraction leftovers) to drop.
stop = set(stopwords.words('english')).union(("http", "https", "s", "nt", "m"))

def adjust_spelling(text):
    """Spell-correct every token of ``text`` and re-join the tokens with single spaces.

    Returns None when ``text`` is None. A token for which the spell checker
    yields no correction (None) is kept unchanged, so the final join cannot
    fail on a None entry.
    """
    if text is None:
        return None
    corrected = []
    for token in word_tokenize(text):
        suggestion = spell.correction(token)
        # Newer pyspellchecker releases return None for words with no
        # candidate; fall back to the original token in that case.
        corrected.append(suggestion if suggestion is not None else token)
    return " ".join(corrected)

def remove_stopwords(text):
    """Tokenize ``text``, drop tokens found in ``stop`` and re-join with single spaces.

    Returns None when ``text`` is None.
    """
    if text is None:
        return None
    kept = []
    for token in word_tokenize(text):
        if token not in stop:
            kept.append(token)
    return " ".join(kept)
    
def lemmatize_words(text):
    """Tokenize ``text``, lemmatize each token and re-join with single spaces.

    Returns None when ``text`` is None.
    """
    if text is None:
        return None
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

# The full stop is deliberately not in this list
def handle_punct_1(text):
    """Replace each of the listed punctuation characters with a single space."""
    punctuations = '@#!?+&*[]-%/()$=><|{}^:,;-' + "'`"
    # One translation table mapping every listed character to a blank.
    to_blank = str.maketrans(punctuations, ' ' * len(punctuations))
    return text.translate(to_blank)

# Handles the full stop
def handle_punct_2(text):
    """Replace every '.' that is directly followed by a whitespace character with one space.

    Both the dot and the whitespace character are consumed by the match.
    """
    dot_then_space = re.compile(r'\.\s', flags=re.MULTILINE)
    return dot_then_space.sub(' ', text)
def handle_punct_3(text):
    """Replace a '.' sitting at the end of a line (``re.MULTILINE``) with a space."""
    trailing_dot = re.compile(r'\.$', flags=re.MULTILINE)
    return trailing_dot.sub(' ', text)

# Normalise ellipses into a stand-alone ' ... ' token
def handle_punct_4(text):
    """Replace every run of two or more dots with the token ``' ... '``.

    Single dots (e.g. in ``m1.94`` or ``t.co``) are left alone.

    The earlier implementation had an unreachable ``elif`` branch and left
    ``..`` untouched whenever the same text also contained ``...``; a single
    regex over dot runs treats both spellings uniformly.
    """
    return re.sub(r'\.{2,}', ' ... ', text)

def handle_char_entity(text):
    """Decode the ``&gt;``, ``&lt;`` and ``&amp;`` character entities, in that order."""
    # '&amp;' is decoded last so that '&amp;gt;' becomes '&gt;' rather than '>'.
    for entity, char in (("&gt;", ">"), ("&lt;", "<"), ("&amp;", "&")):
        text = text.replace(entity, char)
    return text

def remove_multi_whitespaces(text):
    """Collapse every run of whitespace characters in ``text`` into one space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

In [3]:
def clean_text(table):
    """Normalise the ``text`` column of ``table`` and return the same table.

    The column is overwritten on the passed-in object (the caller's frame is
    modified) once every step has succeeded. Steps, in order: lower-casing,
    tag removal, leading-URL removal, emoji removal, character-entity
    decoding, punctuation handling, lemmatization, stopword removal and
    whitespace collapsing.

    Character entities are decoded *before* punctuation is stripped:
    ``handle_punct_1`` removes ``&`` and ``;``, so decoding afterwards could
    never match and ``&amp;`` used to leave a stray ``amp`` token behind.
    """
    # Lower-casing discards title-case information.
    text = table['text'].apply(lambda x: x.lower())

    pre_steps = (
        cleanhtml,           # strip <...> tags
        removeurl,           # drop lines that start with a URL
        remove_emoji,        # drop emoji / pictographs
        handle_char_entity,  # &amp; &lt; &gt; -> real characters
        handle_punct_1,      # listed punctuation -> space
        handle_punct_2,      # '. ' -> ' '
        handle_punct_3,      # trailing '.' -> ' '
        handle_punct_4,      # ellipsis -> ' ... '
    )
    for step in pre_steps:
        text = text.apply(step)

    # Spelling correction (adjust_spelling) is currently disabled.

    print("done with adjusting")

    post_steps = (
        lemmatize_words,           # reduce tokens to lemmas
        remove_stopwords,          # drop stopwords
        remove_multi_whitespaces,  # collapse whitespace runs
    )
    for step in post_steps:
        text = text.apply(step)

    table['text'] = text

    print("done")

    return table

In [4]:
# Load the competition files; training rows without a label are discarded.
test_data = pd.read_csv("../input/nlp-getting-started/test.csv")
train_data = pd.read_csv("../input/nlp-getting-started/train.csv").dropna(axis=0, subset=['target'])

train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [5]:
## Split the target labels (y) from the input features (x)
# pop() removes the column from train_data and hands back its values in one step.
train_y = train_data.pop("target").values

## Clean the test and train datasets
test_data = clean_text(test_data)
train_data = clean_text(train_data)

print('setup complete')

train_data

done with adjusting
done
done with adjusting
done
setup complete


Unnamed: 0,id,keyword,location,text
0,1,,,deed reason earthquake may allah forgive u
1,4,,,forest fire near la ronge sask canada
2,5,,,resident asked shelter place notified officer ...
3,6,,,13 000 people receive wildfire evacuation orde...
4,7,,,got sent photo ruby alaska smoke wildfire pour...
...,...,...,...,...
7608,10869,,,two giant crane holding bridge collapse nearby...
7609,10870,,,aria_ahrary thetawniest control wild fire cali...
7610,10871,,,m1.94 01 04 utc 5km volcano hawaii t.co zdtoyd...
7611,10872,,,police investigating e bike collided car littl...


In [6]:
def tf_idf_transformation(features=None, ngrams=(1,1)):
    """Build TF-IDF matrices for the train and test texts.

    Reads the module-level ``train_data`` and ``test_data`` frames. The
    vectorizer is fitted on the train and test texts together, then each set
    is transformed separately.

    Parameters
    ----------
    features : passed to ``TfidfVectorizer`` as ``max_features``.
    ngrams : passed to ``TfidfVectorizer`` as ``ngram_range``.

    Returns
    -------
    (train_tfidf, test_tfidf) : the transformed train and test texts.
    """
    train_docs = train_data['text'].values.tolist()
    test_docs = test_data['text'].values.tolist()

    tfidf_vec = TfidfVectorizer(stop_words='english', max_features=features, ngram_range=ngrams)
    # Only the fitted vocabulary/IDF is needed here; fit_transform would also
    # build a matrix for the combined corpus that is immediately discarded.
    tfidf_vec.fit(train_docs + test_docs)

    return tfidf_vec.transform(train_docs), tfidf_vec.transform(test_docs)

# Creazione del Modello - Logistic Regression

In [7]:
from tqdm import tqdm
from sklearn import model_selection, preprocessing, metrics, ensemble, naive_bayes, linear_model
 
## Fit the given model and predict on the test matrix
def runModel(model, train_X, train_y, test_X):
    """Fit ``model`` on ``(train_X, train_y)`` and return column 1 of ``predict_proba(test_X)``."""
    model.fit(train_X, train_y)
    probabilities = model.predict_proba(test_X)
    return probabilities[:, 1]

In [8]:
# Disabled hyper-parameter search: the code is wrapped in a string literal, so
# nothing below is executed and the cell only echoes the text. The text loops
# over max_features, n-gram range, C and decision threshold, scoring each
# combination with 10-fold cross-validated F1 and collecting the results in
# params_used.
'''if __name__ == '__main__':
    
    params_used = []
    
    
    for num_features in [None, 3000, 8000, 15000]:
        for num_grams in [(1,1), (1,2)]:
            train_tfidf, test_tfidf = tf_idf_transformation(num_features, num_grams)

            #1, 1.5, 2.5, 5, 10, 15
            C_list = [0.1, 0.5, 1, 1.5, 2.5, 5, 10, 15]

            #usare 10
            num_splits = 10

            for c in C_list:
                
                param_with_score = {
                    "n_features" : None,
                    "n_grams" : (1,1),
                    "c": 0.1,
                    "score": 0,
                    "threshold": 0,
                }

                param_with_score["c"] = c
                param_with_score["n_grams"] = num_grams
                param_with_score["n_features"] = num_features

                model = linear_model.LogisticRegression(C=c, solver='sag')
                best_threshold = 0
                best_score = 0

                #usare 0.01 e range(100)
                for threshold in tqdm([i * 0.01 for i in range(100)]):
                    score = 0
                    kf = model_selection.KFold(n_splits=num_splits, shuffle=True, random_state = 0)
                    for train_index, val_index in kf.split(train_tfidf):
                        X_train, X_val = train_tfidf[train_index], train_tfidf[val_index]
                        y_train, y_val = train_y[train_index], train_y[val_index]

                        preds = runModel(model, X_train, y_train, X_val)
                        score += metrics.f1_score(y_true = y_val, y_pred = preds > threshold)

                    score /= num_splits

                    if score > best_score:
                        best_threshold = threshold
                        best_score = score

                param_with_score["score"] = best_score
                param_with_score["threshold"] = best_threshold

                params_used.append(param_with_score)
'''

'if __name__ == \'__main__\':\n    \n    params_used = []\n    \n    \n    for num_features in [None, 3000, 8000, 15000]:\n        for num_grams in [(1,1), (1,2)]:\n            train_tfidf, test_tfidf = tf_idf_transformation(num_features, num_grams)\n\n            #1, 1.5, 2.5, 5, 10, 15\n            C_list = [0.1, 0.5, 1, 1.5, 2.5, 5, 10, 15]\n\n            #usare 10\n            num_splits = 10\n\n            for c in C_list:\n                \n                param_with_score = {\n                    "n_features" : None,\n                    "n_grams" : (1,1),\n                    "c": 0.1,\n                    "score": 0,\n                    "threshold": 0,\n                }\n\n                param_with_score["c"] = c\n                param_with_score["n_grams"] = num_grams\n                param_with_score["n_features"] = num_features\n\n                model = linear_model.LogisticRegression(C=c, solver=\'sag\')\n                best_threshold = 0\n                best_score =

In [9]:
# Disabled report of the search results: kept as a string literal, so it is
# not executed. The text prints one line per entry of params_used.
'''for i in range(0, len(params_used)):
    print("n_features:", params_used[i]["n_features"], "          n_grams:", params_used[i]["n_grams"], "C:", params_used[i]["c"], "          score:", params_used[i]["score"], "          threshold:", params_used[i]["threshold"])'''

'for i in range(0, len(params_used)):\n    print("n_features:", params_used[i]["n_features"], "          n_grams:", params_used[i]["n_grams"], "C:", params_used[i]["c"], "          score:", params_used[i]["score"], "          threshold:", params_used[i]["threshold"])'

In [10]:
# Disabled selection of the best configuration: kept as a string literal, so
# it is not executed. The text picks the params_used entry with the highest
# "score" and names it best_params.
'''index = 0

for i in range(0, len(params_used)):
    if (params_used[i]["score"] > params_used[index]["score"]):
        index = i
        
best_params = params_used[index]

best_params'''

'index = 0\n\nfor i in range(0, len(params_used)):\n    if (params_used[i]["score"] > params_used[index]["score"]):\n        index = i\n        \nbest_params = params_used[index]\n\nbest_params'

In [11]:
# Hard-coded model configuration used for the final fit.
# NOTE(review): presumably the winner of the disabled grid search above; confirm before changing.
best_params = dict(
    n_features=8000,
    n_grams=(1, 1),
    c=1.5,
    score=0.7603229884729064,
    threshold=0.43,
)

best_params

{'n_features': 8000,
 'n_grams': (1, 1),
 'c': 1.5,
 'score': 0.7603229884729064,
 'threshold': 0.43}

In [12]:
## Build the model with the selected hyper-parameters, fit it and predict the test set

best_train_tfidf, best_test_tfidf = tf_idf_transformation(best_params["n_features"], best_params["n_grams"])

# The 'sag' solver shuffles the data, so random_state is pinned to make the
# submission reproducible from a fresh kernel.
model = linear_model.LogisticRegression(C=best_params["c"], solver='sag', random_state=0)

# runModel fits the model and returns the positive-class probabilities.
pred_test = runModel(model, best_train_tfidf, train_y, best_test_tfidf)

# Binarise the probabilities with the chosen decision threshold.
final_preds = [1 if x > best_params["threshold"] else 0 for x in pred_test]

In [13]:
# Pair every test id with its predicted label and write the submission file.
output = test_data[['id']].assign(target=final_preds)
output.to_csv('submission.csv', index=False)

# Primo approccio (Preprocessing) - Aggiungiamo Features Extra
Aggiungiamo delle Feature extra derivanti da quelle del dataset originale in modo da migliorare il modello finale

- num_words
- num_unique_words
- num_chars
- num_stopwords (parole ignorate dai search engine: congiunzioni, parole molto frequenti e irrilevanti, es. 'for' e 'of')
- num_punctuations
- num_words_upper
- num_words_title (Programmazione a Numeri Interi : nel titolo di solito le parole importanti sono in maiusc)
- mean_word_len
- url_count
- hashtag_count
- mention_count

In [14]:
## Features computed on the raw text, i.e. before clean_text has altered it

def add_feature_pre_clean(table):
    """Add count/length features derived from the ``text`` column and return ``table``.

    The columns are written onto the passed-in object. Every value is first
    converted with ``str``. Columns added: ``num_chars``, ``num_words_upper``,
    ``num_words_title``, ``mean_word_len``, ``url_count``, ``hashtag_count``
    and ``mention_count``.

    ``mean_word_len`` is 0.0 for a text without any word; previously that
    case produced NaN (and a numpy RuntimeWarning).
    """
    raw = table["text"].apply(str)
    words = raw.apply(str.split)

    ## Number of characters in the text ##
    table["num_chars"] = raw.apply(len)

    # Stopword and punctuation counts are currently disabled.

    ## Number of upper-case words in the text ##
    table["num_words_upper"] = words.apply(lambda ws: sum(1 for w in ws if w.isupper()))

    ## Number of title-case words in the text ##
    table["num_words_title"] = words.apply(lambda ws: sum(1 for w in ws if w.istitle()))

    ## Average length of the words in the text ##
    def _mean_word_len(ws):
        # An empty word list has no mean; report 0.0 instead of NaN.
        if not ws:
            return 0.0
        return float(np.mean([len(w) for w in ws]))

    table["mean_word_len"] = words.apply(_mean_word_len)

    # url_count: whitespace-separated tokens containing 'http' (this covers 'https' too)
    table['url_count'] = raw.apply(lambda s: sum(1 for w in s.lower().split() if 'http' in w))

    # hashtag_count
    table['hashtag_count'] = raw.apply(lambda s: s.count('#'))

    # mention_count
    table['mention_count'] = raw.apply(lambda s: s.count('@'))

    return table

def add_feature_post_clean(table):
    """Add ``num_words`` and ``num_unique_words`` computed from the ``text`` column.

    Each value is converted with ``str`` and split on whitespace. The columns
    are written onto the passed-in table, which is also returned.
    """
    def _word_count(value):
        return len(str(value).split())

    def _unique_word_count(value):
        return len(set(str(value).split()))

    ## Number of words in the text ##
    table["num_words"] = table["text"].apply(_word_count)

    ## Number of unique words in the text ##
    table["num_unique_words"] = table["text"].apply(_unique_word_count)

    return table