In [107]:
import pandas as pd
from fuzzywuzzy import fuzz
import re



In [108]:
# Read the pre-processed train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_pre_processing.csv", "data/test_pre_processing.csv")
)

In [109]:
# Missing values in the flag become False so the column is strictly boolean.
for frame in (train, test):
    frame['text_contain_keyword'] = frame['text_contain_keyword'].fillna(False).astype('bool')

In [110]:
def string_contain_word_string(s,ws):
    """Tell whether any long-enough token of ``ws`` occurs inside ``s``.

    ``ws`` is lower-cased and cut at spaces, commas, colons and periods; a
    token only counts when it is longer than 3 characters. Matching is a
    case-insensitive substring test against ``s``.

    Returns False when ``ws`` is missing (NaN/None) or no token matches.
    """
    if pd.isna(ws):
        return False
    # One split on the four separators yields the same tokens, in the same
    # order, as splitting on each separator in turn.
    tokens = re.split(r"[ ,:.]", ws.lower())
    return any(len(token) > 3 and token in s.lower() for token in tokens)

# Flag rows whose text contains some token of that row's own location.
for frame in (train, test):
    frame['text_contain_word_location'] = frame.apply(
        lambda row: string_contain_word_string(row.text, row.location), axis = 1
    )

In [111]:
def _add_location_size_features(measure, cero_column, mean_column):
    """Add one location-size feature, in two variants, to train and test.

    ``cero_column`` holds ``measure(location)``, with 0 for missing locations.
    ``mean_column`` is that same column with every 0 replaced by the
    truncated mean of the non-zero values found in train.

    Returns the truncated train mean used as replacement.
    """
    for frame in (train, test):
        frame[cero_column] = frame.location.transform(
            lambda loc: 0 if pd.isna(loc) else measure(loc)
        )

    # The default is derived from train only and then reused for test.
    non_zero = train.loc[train[cero_column] != 0, cero_column]
    train_mean = int(non_zero.mean())

    for frame in (train, test):
        frame[mean_column] = frame[cero_column].replace(0, train_mean)
    return train_mean


# Location size in characters.
len_location_mean = _add_location_size_features(
    len, 'len_location_cero_default', 'len_location_mean_default'
)

# Location size in space-separated words.
total_words_location_mean = _add_location_size_features(
    lambda loc: len(loc.split(" ")),
    'total_words_location_cero_default',
    'total_words_location_mean_default',
)



In [112]:
def text_contain_similarity_word(x,y):
    """Check whether some space-separated word of ``x`` closely matches ``y``.

    Both sides are lower-cased; a word matches when its ``fuzz.ratio``
    against ``y`` is strictly greater than 80.

    Returns False when ``y`` is missing (NaN/None) or no word matches.
    """
    if pd.isna(y):
        return False

    words = x.lower().split(" ")
    target = y.lower()
    return any(fuzz.ratio(word, target) > 80 for word in words)

# Flag rows where some word of the text fuzzily matches the grouped keyword.
for frame in (train, test):
    frame['text_contain_keyword_similarity'] = frame.apply(
        lambda row: text_contain_similarity_word(row.text, row.keyword_grouped), axis = 1
    )



In [113]:
# Fuzzy ratio of the whole lower-cased text against the grouped keyword;
# rows without a grouped keyword get 0.
for frame in (train, test):
    frame['text_similarity_keyword'] = frame.apply(
        lambda row: 0 if pd.isna(row.keyword_grouped)
        else fuzz.ratio(row.text.lower(), row.keyword_grouped.lower()),
        axis = 1,
    )



In [114]:
def get_best_similarity(s,w):
    """Return the highest ``fuzz.ratio`` between ``w`` and any word of ``s``.

    ``s`` is split on single spaces and every piece is scored against ``w``.
    The result never goes below 0, which is the starting value.
    """
    best = 0

    for token in s.split(" "):
        # Score each word exactly once; the ratio is the expensive part.
        score = fuzz.ratio(token, w)
        if score > best:
            best = score
    return best

# Best per-word fuzzy ratio between the text and the grouped keyword;
# rows without a grouped keyword get 0.
for frame in (train, test):
    frame['text_best_similarity_keyword'] = frame.apply(
        lambda row: 0 if pd.isna(row.keyword_grouped)
        else get_best_similarity(row.text.lower(), row.keyword_grouped.lower()),
        axis = 1,
    )


In [115]:
# Fuzzy ratio of the whole lower-cased text against the location;
# rows without a location get 0.
for frame in (train, test):
    frame['text_similarity_location'] = frame.apply(
        lambda row: 0 if pd.isna(row.location)
        else fuzz.ratio(row.text.lower(), row.location.lower()),
        axis = 1,
    )


In [116]:
# Best per-word fuzzy ratio between the text and the location;
# rows without a location get 0.
for frame in (train, test):
    frame['text_best_similarity_location'] = frame.apply(
        lambda row: 0 if pd.isna(row.location)
        else get_best_similarity(row.text.lower(), row.location.lower()),
        axis = 1,
    )


In [117]:
# Ratio of the 3-or-less count to the 5-or-more count; a zero denominator is
# swapped for 1 so the division is always defined.
for frame in (train, test):
    long_word_count = frame['total_5_ormore_words'].replace(0,1)
    frame['ratio_short_big_words'] = frame['total_3_orless_words'] / long_word_count

In [118]:
# Columns removed from both sets at this point; listing them once keeps
# train and test in step.
first_pass_dropped = [
    'subjectivity_text',
    'text_best_similarity_location',
    'text_similarity_location',
    'total_4_words',
    'total_4_ormore_words',
    'total_7_words',
    'total_4_orless_words',
    'len_location_mean_default',
]
for frame in (train, test):
    frame.drop(columns=first_pass_dropped, inplace = True)


In [119]:
# Count, for every token of 4+ characters in the train texts, how often it
# occurs in rows with target == 1 and in all other rows.
# words_dict maps token -> [occurrences with target == 1, occurrences otherwise].
#
# The separators are written as one raw-string character class. The previous
# alternation contained `[|]`, which only matches a literal "|", so square
# brackets were never treated as separators; "[" and "]" are now escaped
# explicitly and "|" is kept so tokens that used to be split still are.
token_separator = re.compile(r"[ '*\n:#@\-?.,|\[\]!¡]")

words_dict = {}
for row_text, row_target in zip(train['text'], train['target']):
    for word in token_separator.split(row_text):
        word = word.lower()
        if len(word) < 4:
            continue
        counts = words_dict.setdefault(word, [0, 0])
        # Slot 0 counts target == 1 rows, slot 1 everything else.
        counts[0 if row_target == 1 else 1] += 1

In [120]:
# One row per word with its two occurrence counters; only words seen more
# than 10 times overall are kept.
words_df = (
    pd.DataFrame(words_dict, index=['total_target_true','total_target_false'])
    .transpose()
    .loc[lambda counts: (counts.total_target_true + counts.total_target_false) > 10]
)

In [121]:
# Share of each word's occurrences found with target == 1 and with any other
# target value, computed for the whole table at once.
_occurrences = words_df.total_target_true + words_df.total_target_false
_true_share = words_df.total_target_true / _occurrences
_false_share = words_df.total_target_false / _occurrences


def _words_matching(mask):
    """Return, in words_df row order, the words whose mask entry is truthy."""
    return [word for word, keep in zip(words_df.index, mask) if keep]


# "100" lists: words that never occur on the opposite side.
words_100_true = _words_matching(words_df.total_target_false == 0)
words_100_false = _words_matching(words_df.total_target_true == 0)

# "NN" lists: words whose share on that side is at least NN percent.
words_90_true = _words_matching(_true_share >= 0.9)
words_90_false = _words_matching(_false_share >= 0.9)
words_85_true = _words_matching(_true_share >= 0.85)
words_85_false = _words_matching(_false_share >= 0.85)
words_80_true = _words_matching(_true_share >= 0.8)
words_80_false = _words_matching(_false_share >= 0.8)
words_75_true = _words_matching(_true_share >= 0.75)
words_75_false = _words_matching(_false_share >= 0.75)
words_70_true = _words_matching(_true_share >= 0.7)
words_70_false = _words_matching(_false_share >= 0.7)

In [122]:
def text_contain_word_list(s,l):
    """Return True when any word of ``l`` occurs in ``s``, ignoring case.

    Matching is a plain substring test, so a word also matches inside a
    longer word. An empty ``l`` yields False.
    """
    # Lower-case the text once instead of once per candidate word.
    haystack = s.lower()
    return any(word.lower() in haystack for word in l)

In [123]:
# Feature column -> word list it tests for. Dict order is the order in which
# the columns are appended to each frame.
word_list_features = {
    'contain_words_100_true': words_100_true,
    'contain_words_100_false': words_100_false,
    'contain_words_90_true': words_90_true,
    'contain_words_90_false': words_90_false,
    'contain_words_85_true': words_85_true,
    'contain_words_85_false': words_85_false,
    'contain_words_80_true': words_80_true,
    'contain_words_80_false': words_80_false,
    'contain_words_75_true': words_75_true,
    'contain_words_75_false': words_75_false,
    'contain_words_70_true': words_70_true,
    'contain_words_70_false': words_70_false,
}

# Each column flags texts containing at least one word of its list.
for frame in (train, test):
    for column, words in word_list_features.items():
        frame[column] = frame.text.transform(lambda x: text_contain_word_list(x, words))

In [124]:
# Second batch of columns removed from both sets, listed once so train and
# test stay in step.
second_pass_dropped = [
    'len_text',
    'contain_words_75_false',
    'total_7_ormore_words',
    'contain_words_100_false',
    'total_common_chars',
    'total_7_orless_words',
]
for frame in (train, test):
    frame.drop(columns=second_pass_dropped, inplace = True)



In [125]:
# Take the target column out and append it again so it ends up as the last
# column, then print the frame summary.
target = train.pop('target')
train['target'] = target
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 47 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   keyword                            7552 non-null   object 
 1   location                           5080 non-null   object 
 2   text                               7613 non-null   object 
 3   keyword_grouped                    7552 non-null   object 
 4   text_contain_keyword               7613 non-null   bool   
 5   total_words                        7613 non-null   int64  
 6   total_upper_chars                  7613 non-null   int64  
 7   total_numbers_chars                7613 non-null   int64  
 8   total_special_chars                7613 non-null   int64  
 9   contain_question                   7613 non-null   bool   
 10  contain_link                       7613 non-null   bool   
 11  contain_hashtag                    7613 non-null   bool 

In [126]:
# Print the column summary (names, non-null counts, dtypes) of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 47 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 3263 non-null   int64  
 1   keyword                            3237 non-null   object 
 2   location                           2158 non-null   object 
 3   text                               3263 non-null   object 
 4   keyword_grouped                    3237 non-null   object 
 5   text_contain_keyword               3263 non-null   bool   
 6   total_words                        3263 non-null   int64  
 7   total_upper_chars                  3263 non-null   int64  
 8   total_numbers_chars                3263 non-null   int64  
 9   total_special_chars                3263 non-null   int64  
 10  contain_question                   3263 non-null   bool   
 11  contain_link                       3263 non-null   bool 

In [128]:
#train.to_csv('data/train_pre_processing_3.csv', index=False)
#test.to_csv('data/test_pre_processing_3.csv', index=False)