In [None]:
#!pip install textblob

In [1]:
import pandas as pd
import os
import string
from textblob import TextBlob
from utils.cleaning_tools import *

In [2]:
train_file = './Data/train.csv'
train_enriched_file = './Data/train_enriched.csv'
special_char = '~:;}]{[!@#$%^&*()_+=-><,.|\/?\'\"'
hash_char = '#'
at_char = '@'


In [3]:
train_df = pd.read_csv(train_file,encoding='utf-8')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### add features

In [4]:
def add_features(df):
    df['word_count'] = df['text'].apply(lambda t: len(t.split()))
    df['unique_words_count'] = df['text'].apply(lambda t: len(set(t.split())))
    df['Tweet_len'] = df['text'].apply(lambda t: len(t))
    df['special_chars_count'] = df['text'].apply(lambda t: len([c for c in str(t) if c in special_char]))
    #df['punctuation_count'] = df['text'].apply(lambda t: len([c for c in str(t) if c in string.punctuation]))
    df['hash_count'] = df['text'].apply(lambda t: len([c for c in str(t) if c in hash_char]))
    df['@_count'] = df['text'].apply(lambda t: len([c for c in str(t) if c in at_char]))
    df['URL_count']  = df['text'].apply(lambda t: len([w for w in str(t).lower().split() if 'http' in w or 'https' in w]))
    df['sentiment'] = df['text'].apply(lambda t: TextBlob(t).sentiment.polarity)
    df['subjectivity'] = df['text'].apply(lambda t: TextBlob(t).sentiment.subjectivity)
    
    return df

In [5]:
# The polarity score is a float within the range [-1.0, 1.0], where -1 is very negative and +1 very positive
# The subjectivity is a float within the range [0.0, 1.0], where 0.0 is very objective and 1.0 is very subjective.

In [6]:
train_df = add_features(train_df)
train_df.sample()

Unnamed: 0,id,keyword,location,text,target,word_count,unique_words_count,Tweet_len,special_chars_count,hash_count,@_count,URL_count,sentiment,subjectivity
2578,3696,destroy,,@engineermataRAI ate mataas kc rating..but the...,0,21,19,126,3,0,1,0,-0.2,0.0


In [7]:
table = pd.pivot_table(train_df, index='keyword', columns='target',values='id', aggfunc='count')
table.reset_index(inplace=True)
table[0].fillna(0, inplace=True)
table[1].fillna(0, inplace=True) 
table['sum'] = (table[0] + table[1])
table['dis%'] = table[1]/(table[0] + table[1])
table.sample(5)

target,keyword,0,1,sum,dis%
96,evacuated,4.0,32.0,36.0,0.888889
197,tornado,19.0,16.0,35.0,0.457143
51,crash,21.0,12.0,33.0,0.363636
146,natural%20disaster,16.0,18.0,34.0,0.529412
46,collapse,19.0,15.0,34.0,0.441176


In [8]:
train_df = train_df.merge(table[['keyword','dis%']], on='keyword', how='left')
train_df.sample()

Unnamed: 0,id,keyword,location,text,target,word_count,unique_words_count,Tweet_len,special_chars_count,hash_count,@_count,URL_count,sentiment,subjectivity,dis%
4087,5808,hail,"Brasil, Fortaleza ce",Seen on Fahlo:#WCW All Hail the QueenåÊ?? http...,0,12,12,113,15,2,0,2,0.0,0.0,0.514286


### drop duplicates

In [9]:
len(train_df)

7613

In [10]:
train_df.drop_duplicates(subset=['text'],inplace=True)
train_df.reset_index(drop=True,inplace=True)
len(train_df)

7503

### delete location column

In [11]:
train_df.drop(columns=['location'],inplace=True)


### replace NaN with 'no_keyword'

In [12]:
train_df.fillna('no_keyword', inplace=True)

### cleaning

In [13]:
#ToDo: how to write it correctly so it return df? 
#def clean_test(df,columns: list) -> DataFrame():

def clean_text(df,columns: list):    
    for item in columns:
        new_head = str(item)+'_clean'
        df[new_head] = df[item].apply(lambda x: str(x).lower())
        
        df[new_head] = df[new_head].replace(repl_dict, regex=True)
        df[new_head] = df[new_head].apply(lambda x: str(x).strip())
        df[new_head] = df[new_head].apply(lambda x: remove_http(x))
        df[new_head] = df[new_head].apply(lambda x: remove_stopwords(x))
        df[new_head] = df[new_head].apply(lambda x: text_stemmer(x))
        
    return df
        

In [14]:
train_df = clean_text(train_df,['text','keyword'])
train_df.head(3)

Unnamed: 0,id,keyword,text,target,word_count,unique_words_count,Tweet_len,special_chars_count,hash_count,@_count,URL_count,sentiment,subjectivity,dis%,text_clean,keyword_clean
0,1,no_keyword,Our Deeds are the Reason of this #earthquake M...,1,13,13,69,1,1,0,0,0.0,0.0,no_keyword,deed reason #earthquak may allah forgiv,no_keyword
1,4,no_keyword,Forest fire near La Ronge Sask. Canada,1,7,7,38,1,0,0,0,0.1,0.4,no_keyword,forest fire near la rong sask. canada,no_keyword
2,5,no_keyword,All residents asked to 'shelter in place' are ...,1,22,20,133,3,0,0,0,-0.01875,0.3875,no_keyword,resid ask 'shelter place' notifi officers. eva...,no_keyword


In [15]:
train_df.to_csv(train_enriched_file, encoding='utf-8',index=False)