In [143]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

In [144]:
# Read the training set from train.csv (the path is relative to the working directory).
data = pd.read_csv('train.csv')

# Bare trailing expression: the notebook renders the DataFrame as this cell's output.
data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [145]:
# Upper-case every tweet: the cleaning patterns applied to this column in the
# next cell are written with capital letters (e.g. "HTTP", "&AMP", "[^A-Z0-9 ]").
data['text'] = data['text'].str.upper()

In [146]:
# Clean the upper-cased tweet text in a single chained pass.
# Order matters: any step that targets a specific symbol must run before the
# catch-all strip, which blanks everything outside A-Z / 0-9 / space.
data['text'] = (
    data['text']
    # HTML-entity residue; the trailing ";" is blanked by the catch-all strip.
    .str.replace(r"&AMP", ' ', regex=True)
    .str.replace(r"&GT", ' ', regex=True)
    # Drop the "@" of a mention but keep the handle itself.
    .str.replace(r"@+", '', regex=True)
    # "?" is a regex metacharacter and the default of `regex` differs between
    # pandas versions, so request a literal replacement explicitly.
    .str.replace("?", '? ', regex=False)
    .str.replace("!", '! ', regex=False)
    # Remove URLs while "://" and "/" still hold them together as one token.
    .str.replace(r"HTTP\S+", '', regex=True)
    # Transliterate accented vowels BEFORE the strip below. Done afterwards
    # (as this cell originally did) these replacements can never match,
    # because the strip has already turned the accented letters into spaces.
    .str.translate(str.maketrans("ÁÉÍÓÚ", "AEIOU"))
    # Catch-all: anything that is not a capital letter, digit or space.
    .str.replace(r"[^A-Z0-9 ]", ' ', regex=True)
    # Drop a run of digits that is immediately followed by a letter.
    .str.replace(r"\d+(?=[A-Z])", '', regex=True)
    # Collapse the runs of spaces produced by the steps above.
    .str.replace(r" +", ' ', regex=True)
)

In [147]:
# Upper-case the locations; the cleanup in the next cell keeps only A-Z, 0-9 and spaces.
data['location'] = data['location'].str.upper()

In [148]:
# Clean the upper-cased location strings and mark emptied-out values as missing.
data['location'] = (
    data['location']
    # Delete everything except capital letters, digits and spaces.
    .str.replace(r"[^A-Z0-9 ]", '', regex=True)
    # Collapse runs of spaces left behind by the deletion.
    .str.replace(r" +", ' ', regex=True)
    # Trim the edges so a location made only of symbols and/or spaces ends up as "".
    .str.strip()
    # Nothing left after cleaning -> treat as missing. Comparing against " "
    # alone (as before) missed symbols-only locations, which become "" because
    # the characters are deleted rather than replaced by a space.
    .replace("", np.nan)
)

In [149]:
# Upper-case the keywords and turn the URL-encoded space ("%20") back into a real space.
data['keyword'] = (
    data['keyword']
    .str.upper()
    .str.replace("%20", ' ', regex=True)
)

In [150]:
# Iterating a DataFrame walks its column labels, so this prints the column
# names (one per line), not the tweets themselves.
for column_label in data.columns:
    print(column_label)

id
keyword
location
text
target


In [151]:
# Split the cleaned tweet texts into two plain Python lists by label
# (target == 1 -> disaster, target == 0 -> not a disaster).
disaster_tweets = data.loc[data['target'] == 1, 'text'].tolist()
non_disaster_tweets = data.loc[data['target'] == 0, 'text'].tolist()

In [152]:
import nltk
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import ssl

# Workaround that lets nltk.download() proceed on machines where HTTPS
# certificate verification fails (presumably the reason it is here — TODO confirm
# it is still needed).
# NOTE(review): this swaps the process-wide default HTTPS context factory for one
# that does NOT verify certificates, and nothing in the cells shown restores it,
# so any later HTTPS request made through the default context is unverified too.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build does not expose ssl._create_unverified_context;
    # keep the default (verifying) behaviour.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download the NLTK resources used by this notebook: the stop-word lists and the
# "punkt" / "punkt_tab" packages (presumably required by word_tokenize — verify
# against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/melanydonis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/melanydonis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/melanydonis/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [153]:
# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def get_word_freq(tweets):
    """Count lower-cased content words across an iterable of tweet strings.

    A token is counted only when it is purely alphanumeric (``str.isalnum``)
    and its lower-cased form is not in ``stop_words``.

    Args:
        tweets: Iterable of tweet strings.

    Returns:
        A ``FreqDist`` mapping each lower-cased token to its number of occurrences.
    """
    freq = FreqDist()
    for tweet in tweets:
        for token in word_tokenize(tweet):
            lowered = token.lower()
            if token.isalnum() and lowered not in stop_words:
                freq[lowered] += 1
    return freq

# Word frequencies per class; relies on disaster_tweets and non_disaster_tweets
# having been built by the target-split cell.
disaster_freq, non_disaster_freq = map(
    get_word_freq, (disaster_tweets, non_disaster_tweets)
)

In [154]:
# The 20 most frequent words of each class, as (word, count) pairs.
disaster_common, non_disaster_common = (
    freq.most_common(20) for freq in (disaster_freq, non_disaster_freq)
)

# Heading and list on separate lines (the headings are user-facing Spanish text).
print("Palabras más comunes en tweets de desastres:", disaster_common, sep="\n")
print("\nPalabras más comunes en tweets que no son de desastres:", non_disaster_common, sep="\n")

Palabras más comunes en tweets de desastres:
[('co', 2528), ('http', 2382), ('fire', 182), ('news', 144), ('disaster', 121), ('via', 121), ('california', 115), ('suicide', 112), ('police', 109), ('people', 105), ('2', 102), ('killed', 95), ('like', 94), ('hiroshima', 92), ('storm', 89), ('fires', 86), ('pm', 86), ('crash', 85), ('families', 81), ('train', 79)]

Palabras más comunes en tweets que no son de desastres:
[('co', 2212), ('http', 1927), ('like', 254), ('new', 171), ('get', 163), ('one', 133), ('body', 116), ('2', 114), ('would', 101), ('via', 99), ('video', 96), ('people', 95), ('got', 94), ('love', 91), ('day', 86), ('know', 86), ('time', 85), ('back', 85), ('3', 84), ('full', 84)]


In [155]:
from nltk import bigrams, trigrams

def get_ngram_freq(tweets, n):
    """Count n-grams of content words across an iterable of tweet strings.

    Each tweet is tokenised; tokens that are not purely alphanumeric, or whose
    lower-cased form is in ``stop_words``, are dropped; n-grams are then built
    from the remaining tokens. Because filtering happens first, an n-gram can
    join words that were separated by a removed token in the original tweet.

    Args:
        tweets: Iterable of tweet strings.
        n: Size of the n-grams. Any integer >= 1 is accepted
            (2 = bigrams, 3 = trigrams, as before).

    Returns:
        A ``FreqDist`` mapping each n-gram (a tuple of ``n`` lower-cased
        tokens) to its number of occurrences.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Function-scope import so this cell does not depend on another import line.
    from nltk.util import ngrams

    # Previously any n other than 2 or 3 silently produced an empty result.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    freq = FreqDist()
    for tweet in tweets:
        tokens = [
            word.lower()
            for word in word_tokenize(tweet)
            if word.isalnum() and word.lower() not in stop_words
        ]
        # ngrams(tokens, 2) / ngrams(tokens, 3) yield the same tuples as
        # bigrams(tokens) / trigrams(tokens).
        freq.update(ngrams(tokens, n))
    return freq

# Bigram and trigram frequencies for the disaster tweets.
disaster_bigrams, disaster_trigrams = (
    get_ngram_freq(disaster_tweets, size) for size in (2, 3)
)

# Top 10 of each; heading and list on separate lines (headings are user-facing Spanish text).
print("\nBigramas más comunes en tweets de desastres:", disaster_bigrams.most_common(10), sep="\n")
print("\nTrigramas más comunes en tweets de desastres:", disaster_trigrams.most_common(10), sep="\n")


Bigramas más comunes en tweets de desastres:
[(('http', 'co'), 2382), (('suicide', 'bomber'), 59), (('northern', 'california'), 41), (('oil', 'spill'), 38), (('burning', 'buildings'), 36), (('suicide', 'bombing'), 35), (('california', 'wildfire'), 34), (('70', 'years'), 30), (('bomber', 'detonated'), 30), (('confirmed', 'mh370'), 29)]

Trigramas más comunes en tweets de desastres:
[(('suicide', 'bomber', 'detonated'), 30), (('northern', 'california', 'wildfire'), 29), (('latest', 'homes', 'razed'), 28), (('homes', 'razed', 'northern'), 28), (('pkk', 'suicide', 'bomber'), 28), (('bomber', 'detonated', 'bomb'), 28), (('razed', 'northern', 'california'), 27), (('yr', 'old', 'pkk'), 27), (('old', 'pkk', 'suicide'), 27), (('families', 'sue', 'legionnaires'), 26)]
