In [None]:
#!pip install textblob

In [1]:
import pandas as pd
import os
import string
from textblob import TextBlob
from utils.cleaning_tools import *

In [2]:
train_file = './Data/train.csv'
test_file = './Data/test.csv'
train_enriched_file = './Data/train_enriched.csv'
test_enriched_file = './Data/test_enriched.csv'
special_char = '~:;}]{[!@#$%^&*()_+=-><,.|\/?\'\"'
hash_char = '#'
at_char = '@'

In [3]:
train_df = pd.read_csv(train_file,encoding='utf-8')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### add features

In [4]:
def add_features(df):
    df['word_count'] = df['text'].apply(lambda t: len(t.split()))
    df['unique_words_count'] = df['text'].apply(lambda t: len(set(t.split())))
    df['Tweet_len'] = df['text'].apply(lambda t: len(t))
    df['special_chars_count'] = df['text'].apply(lambda t: len([c for c in str(t) if c in special_char]))
    #df['punctuation_count'] = df['text'].apply(lambda t: len([c for c in str(t) if c in string.punctuation]))
    df['hash_count'] = df['text'].apply(lambda t: len([c for c in str(t) if c in hash_char]))
    df['@_count'] = df['text'].apply(lambda t: len([c for c in str(t) if c in at_char]))
    df['URL_count']  = df['text'].apply(lambda t: len([w for w in str(t).lower().split() if 'http' in w or 'https' in w]))
    df['sentiment'] = df['text'].apply(lambda t: TextBlob(t).sentiment.polarity)
    df['subjectivity'] = df['text'].apply(lambda t: TextBlob(t).sentiment.subjectivity)
    
    return df

In [5]:
# The polarity score is a float within the range [-1.0, 1.0], where -1 is very negative and +1 very positive
# The subjectivity is a float within the range [0.0, 1.0], where 0.0 is very objective and 1.0 is very subjective.

In [6]:
train_df = add_features(train_df)
train_df.sample()

Unnamed: 0,id,keyword,location,text,target,word_count,unique_words_count,Tweet_len,special_chars_count,hash_count,@_count,URL_count,sentiment,subjectivity
7272,10411,whirlwind,140920-21 & 150718-19 BEIJING,{INFO} Baekhyun and Suho will be attending the...,0,17,17,124,8,0,0,1,0.0,0.0


In [7]:
table = pd.pivot_table(train_df, index='keyword', columns='target',values='id', aggfunc='count')
table.reset_index(inplace=True)
table[0].fillna(0, inplace=True)
table[1].fillna(0, inplace=True) 
table['sum'] = (table[0] + table[1])
table['dis%'] = table[1]/(table[0] + table[1])
table.sample(5)

target,keyword,0,1,sum,dis%
9,army,29.0,5.0,34.0,0.147059
220,wrecked,36.0,3.0,39.0,0.076923
187,suicide%20bombing,1.0,32.0,33.0,0.969697
193,terrorist,8.0,23.0,31.0,0.741935
210,weapon,25.0,14.0,39.0,0.358974


In [8]:
train_df = train_df.merge(table[['keyword','dis%']], on='keyword', how='left')
train_df.sample()

Unnamed: 0,id,keyword,location,text,target,word_count,unique_words_count,Tweet_len,special_chars_count,hash_count,@_count,URL_count,sentiment,subjectivity,dis%
4467,6350,hostages,,'Well guess what young girls. You aren't damse...,0,20,18,133,12,0,0,1,0.1,0.4,0.72973


### drop duplicates

In [9]:
len(train_df)

7613

In [10]:
train_df.drop_duplicates(subset=['text'],inplace=True)
train_df.reset_index(drop=True,inplace=True)
len(train_df)

7503

### delete location column

In [11]:
train_df.drop(columns=['location'],inplace=True)

### replace NaN with 'no_keyword'

In [12]:
train_df.fillna('no_keyword', inplace=True)

### cleaning

In [13]:
#ToDo: how to write it correctly so it return df? 
#def clean_test(df,columns: list) -> DataFrame():

def clean_text(df,columns: list):    
    for item in columns:
        new_head = str(item)+'_clean'
        df[new_head] = df[item].apply(lambda x: str(x).lower())
        
        df[new_head] = df[new_head].replace(repl_dict, regex=True)
        df[new_head] = df[new_head].apply(lambda x: str(x).strip())
        df[new_head] = df[new_head].apply(lambda x: remove_http(x))
        df[new_head] = df[new_head].apply(lambda x: remove_stopwords(x))
        df[new_head] = df[new_head].apply(lambda x: text_stemmer(x))
        df[new_head] = df[new_head].apply(lambda x: re.sub("@[A-Za-z0-9]+","", x))
        
        slang_dict = pd.read_csv("./Data/twitterSlang.csv", encoding='utf-8')
        slang_dict = dict(zip(slang_dict["abbr"], slang_dict["full_word"]))
        df[new_head] = df[new_head].replace(slang_dict)
        
    return df
        

In [14]:
train_df = clean_text(train_df,['text','keyword'])
train_df.head(3)

Unnamed: 0,id,keyword,text,target,word_count,unique_words_count,Tweet_len,special_chars_count,hash_count,@_count,URL_count,sentiment,subjectivity,dis%,text_clean,keyword_clean
0,1,no_keyword,Our Deeds are the Reason of this #earthquake M...,1,13,13,69,1,1,0,0,0.0,0.0,no_keyword,deed reason #earthquak may allah forgiv,nokeyword
1,4,no_keyword,Forest fire near La Ronge Sask. Canada,1,7,7,38,1,0,0,0,0.1,0.4,no_keyword,forest fire near la rong sask. canada,nokeyword
2,5,no_keyword,All residents asked to 'shelter in place' are ...,1,22,20,133,3,0,0,0,-0.01875,0.3875,no_keyword,resid ask 'shelter place' notifi officers. eva...,nokeyword


In [15]:
train_df['newtext'] = train_df['text_clean'] + ' ' + train_df['keyword_clean'].apply(lambda x: x if x != 'nokeyword' else '')

In [16]:
train_df.sample(2)

Unnamed: 0,id,keyword,text,target,word_count,unique_words_count,Tweet_len,special_chars_count,hash_count,@_count,URL_count,sentiment,subjectivity,dis%,text_clean,keyword_clean,newtext
547,800,battle,YA BOY CLIP VS 4KUS FULL BATTLE\r\n\r\n@15Mofe...,0,14,14,137,12,0,6,1,0.35,0.55,0.192308,ya boy clip vs 4ku full battl,battl,ya boy clip vs 4ku full battl battl
3628,5238,fatality,08/3/15: CAT FATALITY: UTICA NY; PLEASANT &amp...,0,23,23,138,11,0,0,0,0.145833,0.341667,0.27027,08/3/15: cat fatality: utica ny; pleasant &amp...,fatal,08/3/15: cat fatality: utica ny; pleasant &amp...


In [17]:
train_df.to_csv(train_enriched_file, encoding='utf-8',index=False)

# <font color = 'dark green'> test df

In [18]:
test_df = pd.read_csv(test_file,encoding='utf-8')
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [19]:
test_df.drop(columns=['location'],inplace=True)

In [20]:
test_df.fillna('no_keyword', inplace=True)

In [21]:
test_df = clean_text(test_df,['text','keyword'])
test_df.head(3)

Unnamed: 0,id,keyword,text,text_clean,keyword_clean
0,0,no_keyword,Just happened a terrible car crash,happen terribl car crash,nokeyword
1,2,no_keyword,"Heard about #earthquake is different cities, s...","heard #earthquak differ cities, stay safe ever...",nokeyword
2,3,no_keyword,"there is a forest fire at spot pond, geese are...","forest fire spot pond, gees flee across street...",nokeyword


In [22]:
test_df['newtext'] = test_df['text_clean'] + ' ' + test_df['keyword_clean'].apply(lambda x: x if x != 'nokeyword' else '')

In [23]:
test_df.sample(2)

Unnamed: 0,id,keyword,text,text_clean,keyword_clean,newtext
3187,10586,wounded,Police Officer Wounded Suspect Dead After Exch...,polic offic wound suspect dead exchang shot,wound,polic offic wound suspect dead exchang shot wound
358,1158,blight,2:20 BLIGHT ?? EVERY DAY,2:20 blight ?? everi day,blight,2:20 blight ?? everi day blight


In [24]:
test_df.to_csv(test_enriched_file, encoding='utf-8',index=False)