In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import boto3



In [2]:
s3 = boto3.client('s3')
bucket = "disaster-tweets-refined"
paths = [
    "csv/run-1606575584664-part-r-00000",
    "csv/run-1606575584664-part-r-00001",
    "csv/run-1606575584664-part-r-00002",
    "csv/run-1606575584664-part-r-00003",
    "csv/run-1606575584664-part-r-00004",
    "csv/run-1606575584664-part-r-00005",
    "csv/run-1606575584664-part-r-00006",
    "csv/run-1606575584664-part-r-00007",
    "csv/run-1606575584664-part-r-00008",
    "csv/run-1606575584664-part-r-00009",
    "csv/run-1606575584664-part-r-00010"
]
dfs = []
for path in paths:
    res = s3.get_object(Bucket=bucket, Key=path)
    df_ = pd.read_csv(res['Body'], engine='c')
    dfs.append(df_)
df = pd.concat(dfs)

In [3]:
df['finished_lemma'] = df['finished_lemma'].astype(str)
df['target'] = df['target'].apply(lambda x: int(x) if str(x) == '0' or str(x) == '1' else 2)
df = df[df['target'] <= 1]
df

Unnamed: 0,id,keyword,location,text,target,finished_lemma,word_count,unique_word_count,stop_words_count
0,3085,dead,,Dozen of people reportedly dead in iceberg in ...,1,dozen peopl reportedli dead iceberg neelumvall...,19,18,39
1,5771,forest%20fires,Texas,' no pharrell only YOU can prevent forest fir...,0,pharrel prevent forest fire,11,10,11
2,5917,floods,"Karachi , Pakistan",It's literally been two weeks into 2020 and we...,1,liter week alreadi see australian fire volcano...,20,20,41
3,6779,lightning,"Leesburg, FL",.@dantwitty52 shuts the door on the Boom in th...,0,dantwitti shut door boom bottom half lightn co...,20,15,43
4,4823,emergency%20plan,"Cape Town, South Africa",The #Lionlife Assist Helpline aims to provide ...,0,lionlif assist helplin aim provid client good ...,19,19,48
...,...,...,...,...,...,...,...,...,...
1019,7700,nuclear%20reactor,,Butterfree was discovered gamboling behind the...,0,butterfre discov gambol behind omin nuclear re...,9,9,28
1020,8275,quarantine,The United States. Duhh,Many of them continue to claim that vaccines c...,0,mani continu claim vaccin cau autism realli si...,19,17,52
1021,8322,quarantined,,A multi-polar world is re-emerging. The econom...,0,multipolar world reemerg econom might usa sust...,19,18,47
1022,1191,blizzard,,Stats http://t.co/U7vavyrGv9,0,stat,2,2,10


In [4]:
df.shape

(16383, 9)

In [5]:
list(df.columns)

['id',
 'keyword',
 'location',
 'text',
 'target',
 'finished_lemma',
 'word_count',
 'unique_word_count',
 'stop_words_count']

In [6]:
# df["finished_lemma"] = df["finished_lemma"].apply(lambda x: " ".join(x))
# df.head()

In [7]:
df["finished_lemma"].head()

0    dozen peopl reportedli dead iceberg neelumvall...
1                          pharrel prevent forest fire
2    liter week alreadi see australian fire volcano...
3    dantwitti shut door boom bottom half lightn co...
4    lionlif assist helplin aim provid client good ...
Name: finished_lemma, dtype: object

In [8]:
tokenizer = Tokenizer(num_words=100000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["finished_lemma"])
word_index = tokenizer.word_index

In [9]:
df["sequences"] = tokenizer.texts_to_sequences(df["finished_lemma"])
df.head()

Unnamed: 0,id,keyword,location,text,target,finished_lemma,word_count,unique_word_count,stop_words_count,sequences
0,3085,dead,,Dozen of people reportedly dead in iceberg in ...,1,dozen peopl reportedli dead iceberg neelumvall...,19,18,39,"[1696, 7, 2938, 80, 3621, 2119, 96, 273, 824, ..."
1,5771,forest%20fires,Texas,' no pharrell only YOU can prevent forest fir...,0,pharrel prevent forest fire,11,10,11,"[9093, 702, 197, 4]"
2,5917,floods,"Karachi , Pakistan",It's literally been two weeks into 2020 and we...,1,liter week alreadi see australian fire volcano...,20,20,41,"[338, 188, 304, 9, 351, 4, 117, 339, 576, 9094]"
3,6779,lightning,"Leesburg, FL",.@dantwitty52 shuts the door on the Boom in th...,0,dantwitti shut door boom bottom half lightn co...,20,15,43,"[9095, 963, 687, 3231, 2272, 514, 262, 18, 291..."
4,4823,emergency%20plan,"Cape Town, South Africa",The #Lionlife Assist Helpline aims to provide ...,0,lionlif assist helplin aim provid client good ...,19,19,48,"[9096, 912, 4931, 1775, 751, 2939, 23, 1776, 6..."


In [10]:
max_ = 0
for i in list(df["sequences"].values):
    if len(i) > max_:
        max_ = len(i)
print("tweet con mas tokens", max_)

tweet con mas tokens 23


In [11]:
df["sequences"] = pad_sequences(df["sequences"], maxlen=max_, padding="post").tolist()
df.head()

Unnamed: 0,id,keyword,location,text,target,finished_lemma,word_count,unique_word_count,stop_words_count,sequences
0,3085,dead,,Dozen of people reportedly dead in iceberg in ...,1,dozen peopl reportedli dead iceberg neelumvall...,19,18,39,"[1696, 7, 2938, 80, 3621, 2119, 96, 273, 824, ..."
1,5771,forest%20fires,Texas,' no pharrell only YOU can prevent forest fir...,0,pharrel prevent forest fire,11,10,11,"[9093, 702, 197, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,5917,floods,"Karachi , Pakistan",It's literally been two weeks into 2020 and we...,1,liter week alreadi see australian fire volcano...,20,20,41,"[338, 188, 304, 9, 351, 4, 117, 339, 576, 9094..."
3,6779,lightning,"Leesburg, FL",.@dantwitty52 shuts the door on the Boom in th...,0,dantwitti shut door boom bottom half lightn co...,20,15,43,"[9095, 963, 687, 3231, 2272, 514, 262, 18, 291..."
4,4823,emergency%20plan,"Cape Town, South Africa",The #Lionlife Assist Helpline aims to provide ...,0,lionlif assist helplin aim provid client good ...,19,19,48,"[9096, 912, 4931, 1775, 751, 2939, 23, 1776, 6..."


In [12]:
from sklearn.decomposition import TruncatedSVD

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=15, algorithm='randomized', n_iter=100, random_state=122)

sequences_reduced = svd_model.fit_transform(df["sequences"].values.tolist())
df['sequences_reduced'] = sequences_reduced.tolist()

In [13]:
tweets = df['sequences_reduced'].apply(pd.Series)
tweets.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,4008.669019,1102.149714,481.835229,-1693.92274,315.912593,434.832617,3123.678018,-640.203515,-231.361269,310.366664,561.048451,519.512095,-449.081792,-81.286353,-23.816009
1,5462.902321,-7011.936967,-1791.688623,-808.362307,406.781704,134.339053,340.648088,-117.018084,-115.566079,-80.856836,-12.104006,-43.382143,-68.870046,4.044583,-10.503145
2,2328.060448,1673.043111,-1817.113314,2129.480108,1857.502721,-2054.247078,766.63404,-1942.809781,2332.733613,837.363596,-7006.171338,-422.200903,-158.461692,-72.019404,22.856836
3,8410.44105,-4667.644346,-1714.385504,-2074.531249,1060.121414,-2546.022512,-63.232348,-1042.496881,1260.999463,242.82499,-3761.298204,-344.850164,-247.615863,-56.304228,-18.888663
4,9623.297581,-3944.170294,-552.647615,-1594.942286,2682.487074,1963.546781,779.462902,-1616.115315,-1119.173702,1278.909882,-590.541371,1980.928911,-951.742318,-208.58363,-34.657249


In [14]:
newDF = df[['unique_word_count', 'stop_words_count']]

In [15]:
tweets = pd.concat([tweets, newDF], axis=1)
tweets.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,unique_word_count,stop_words_count
0,4008.669019,1102.149714,481.835229,-1693.92274,315.912593,434.832617,3123.678018,-640.203515,-231.361269,310.366664,561.048451,519.512095,-449.081792,-81.286353,-23.816009,18,39
1,5462.902321,-7011.936967,-1791.688623,-808.362307,406.781704,134.339053,340.648088,-117.018084,-115.566079,-80.856836,-12.104006,-43.382143,-68.870046,4.044583,-10.503145,10,11
2,2328.060448,1673.043111,-1817.113314,2129.480108,1857.502721,-2054.247078,766.63404,-1942.809781,2332.733613,837.363596,-7006.171338,-422.200903,-158.461692,-72.019404,22.856836,20,41
3,8410.44105,-4667.644346,-1714.385504,-2074.531249,1060.121414,-2546.022512,-63.232348,-1042.496881,1260.999463,242.82499,-3761.298204,-344.850164,-247.615863,-56.304228,-18.888663,15,43
4,9623.297581,-3944.170294,-552.647615,-1594.942286,2682.487074,1963.546781,779.462902,-1616.115315,-1119.173702,1278.909882,-590.541371,1980.928911,-951.742318,-208.58363,-34.657249,19,48


In [16]:
tweets = pd.concat([tweets, df.iloc[:,3:]], axis=1)
tweets = tweets.drop(columns=["finished_lemma", "sequences", "sequences_reduced"])
tweets.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,unique_word_count,stop_words_count,text,target,word_count,unique_word_count.1,stop_words_count.1
0,4008.669019,1102.149714,481.835229,-1693.92274,315.912593,434.832617,3123.678018,-640.203515,-231.361269,310.366664,...,-449.081792,-81.286353,-23.816009,18,39,Dozen of people reportedly dead in iceberg in ...,1,19,18,39
1,5462.902321,-7011.936967,-1791.688623,-808.362307,406.781704,134.339053,340.648088,-117.018084,-115.566079,-80.856836,...,-68.870046,4.044583,-10.503145,10,11,' no pharrell only YOU can prevent forest fir...,0,11,10,11
2,2328.060448,1673.043111,-1817.113314,2129.480108,1857.502721,-2054.247078,766.63404,-1942.809781,2332.733613,837.363596,...,-158.461692,-72.019404,22.856836,20,41,It's literally been two weeks into 2020 and we...,1,20,20,41
3,8410.44105,-4667.644346,-1714.385504,-2074.531249,1060.121414,-2546.022512,-63.232348,-1042.496881,1260.999463,242.82499,...,-247.615863,-56.304228,-18.888663,15,43,.@dantwitty52 shuts the door on the Boom in th...,0,20,15,43
4,9623.297581,-3944.170294,-552.647615,-1594.942286,2682.487074,1963.546781,779.462902,-1616.115315,-1119.173702,1278.909882,...,-951.742318,-208.58363,-34.657249,19,48,The #Lionlife Assist Helpline aims to provide ...,0,19,19,48


In [17]:
tweets.shape

(16383, 22)

In [18]:
list(tweets.columns)

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 'unique_word_count',
 'stop_words_count',
 'text',
 'target',
 'word_count',
 'unique_word_count',
 'stop_words_count']

In [27]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tweets.drop(columns=["target", "text"]), tweets['target'], test_size=0.3, random_state=0)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

In [28]:
from sklearn.metrics import f1_score, roc_auc_score

print("f1", f1_score(y_test, y_pred, average='weighted'))
print("roc_auc", roc_auc_score(y_test, y_pred))

f1 0.6357229735323059
roc_auc 0.50981852543405
