### Imports

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, ParameterGrid
from nltk.tokenize import word_tokenize
from sklearn.utils import shuffle
import pandas as pd
from copy import deepcopy
import os
import json
import random
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from functions import load_data, query_dataframe

### Define Variables

In [2]:
# Seed handed to make_data_ratio (row shuffle and train/test split).
global_shuffle_seed = 32
# Notebook-wide debug flag; later cells combine it with their own setting.
global_debug=False
# Notebook-wide override flag; when truthy, cached splits/results are rebuilt instead of loaded.
global_override=False

### Load Data

In [3]:
# Accumulators later passed to (and returned from) run_grid.
result, clf_result = {}, {}
# load_data is imported from the local `functions` module.
# Presumably returns the corpus as a pandas DataFrame (it is sampled and queried below) -- confirm in functions.py.
df_data = load_data()

In [4]:
# Split the corpus by label and report the class balance.
data_true, data_false = (
    query_dataframe(df_data, {'is_flood': flag}) for flag in (True, False)
)
print('Total:', len(df_data), 'True:', len(data_true), 'False:', len(data_false))

Total: 608 True: 63 False: 542


In [5]:
def make_data_ratio(df_data, ratio=0, shuffle_seed=4, debug=False, save_folder=None, load_folder=None, override=False, file_prefix=''):
    """Build, or load from a JSON cache, a train/test split with a capped flood:non-flood ratio.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Full corpus with an ``is_flood`` column. Not accessed when a cached split is loaded.
    ratio : int
        Number of non-flood rows kept per flood row; ``0`` keeps every row.
    shuffle_seed : int
        Seed used for both the row shuffle and ``train_test_split``.
    debug : bool
        Print progress messages.
    save_folder, load_folder : str or None
        Folders for writing / reading the cached split. The file is named
        ``<file_prefix>all.json`` for ``ratio == 0`` and ``<file_prefix>1-<ratio>.json`` otherwise.
    override : bool
        When true, ignore any cached file and rebuild the split.
    file_prefix : str
        Prefix prepended to the cache file name.

    Returns
    -------
    dict
        ``{'train': DataFrame, 'test': DataFrame}`` (80/20 split).

    Notes
    -----
    A cached file is returned as-is; it is not checked against ``shuffle_seed``.
    """
    file_name = file_prefix + ('all.json' if ratio == 0 else '1-{}.json'.format(ratio))
    save_file = os.path.join(save_folder, file_name) if save_folder else None
    load_file = os.path.join(load_folder, file_name) if load_folder else None

    if not override and load_file and os.path.isfile(load_file):
        if debug: print('loaded',load_file)
        # Context manager so the handle is closed even if the JSON is malformed.
        with open(load_file) as fh:
            js = json.load(fh)
        return {'train': pd.DataFrame(js['train']), 'test': pd.DataFrame(js['test'])}

    # Shuffle all rows deterministically before sub-sampling.
    new_df = df_data.sample(n=len(df_data), random_state=shuffle_seed).reset_index(drop=True)

    if not ratio:
        total_data = new_df
    else:
        # Single pass over the rows, partitioning on the truthiness of `is_flood`.
        # NOTE(review): truthiness means a NaN label counts as flood and a None label
        # as non-flood -- confirm this is the intended handling of unlabeled rows.
        true_data, false_data = [], []
        for _, row in new_df.iterrows():
            (true_data if row['is_flood'] else false_data).append(dict(row))
        false_nums = min(len(true_data) * ratio, len(false_data))
        if false_nums==len(false_data) and debug: print('All false data points taken')
        # max(..., 0) keeps a negative ratio from slicing from the end of the list.
        total_data = true_data + false_data[:max(false_nums, 0)]

    train, test = train_test_split(total_data, test_size=0.2, random_state=shuffle_seed)
    train_rdf = pd.DataFrame(train)
    test_rdf = pd.DataFrame(test)
    if debug: print('data-ratio',ratio)

    if save_file:
        os.makedirs(save_folder, exist_ok=True)
        # Round-trip through to_json so pandas/numpy values become JSON-native.
        payload = {
            'train': json.loads(train_rdf.to_json(orient='records')),
            'test': json.loads(test_rdf.to_json(orient='records')),
        }
        # Context manager guarantees the file is flushed and closed.
        with open(save_file, 'w') as fh:
            json.dump(payload, fh, indent=2)
    return {'train':train_rdf, 'test':test_rdf}


In [6]:
# Reload the corpus and reset the accumulators.
df_data = load_data()
result, clf_result = {}, {}

# Splits are cached in, and reloaded from, the same folder.
save_data_folder = 'data_splits/'
load_data_folder = 'data_splits/'
if not os.path.isdir(save_data_folder):
    os.mkdir(save_data_folder)

# NOTE: `or True` makes debug unconditionally True in this cell, whatever global_debug is.
debug = global_debug or True
override = global_override or False

# Label -> number of non-flood rows kept per flood row (0 keeps the whole corpus).
data = {
    label: make_data_ratio(df_data, n_false_per_true,
                           save_folder=save_data_folder, load_folder=load_data_folder,
                           debug=debug, shuffle_seed=global_shuffle_seed, override=override)
    for label, n_false_per_true in (('all', 0), ('1:1', 1), ('1:2', 2), ('1:3', 3), ('1:5', 5))
}


loaded data_splits/all.json
loaded data_splits/1-1.json
loaded data_splits/1-2.json
loaded data_splits/1-3.json
loaded data_splits/1-5.json


### Classifier

In [7]:
def make_data(vect_fit, ratio):
    """Vectorize one train/test split and binarize its labels.

    Parameters
    ----------
    vect_fit : object with ``fit`` / ``transform``
        Unfitted feature extractor (e.g. a CountVectorizer or TfidfVectorizer).
    ratio : dict
        ``{'train': ..., 'test': ...}`` where each value exposes ``'text'`` and ``'is_flood'``.

    Returns
    -------
    tuple
        ``(trainX, testX, trainY, testY, vect)``: the transformed texts, the labels as
        0/1 lists, and the fitted extractor.

    Raises
    ------
    Exception
        If either the train or the test part is missing from ``ratio``.
    """
    train, test = ratio.get('train'), ratio.get('test')
    if train is None or test is None: raise Exception('Train or Test data not found')
    train_text, test_text = list(train['text']), list(test['text'])

    # Fit on the training text only: fitting on train+test would leak test-set
    # vocabulary / document frequencies into the features and inflate the score.
    vect = vect_fit.fit(train_text)
    trainX, testX = vect.transform(train_text), vect.transform(test_text)
    # Labels are mapped by truthiness to 1/0.
    trainY = [1 if t else 0 for t in train['is_flood']]
    testY = [1 if t else 0 for t in test['is_flood']]
    return trainX, testX, trainY, testY, vect


In [8]:
def run_classifier(clf, trainX, testX, trainY, testY):
    """Fit ``clf`` on the training split and score it on the test split.

    Returns a tuple of: the object returned by ``clf.fit``, its predictions for
    ``testX``, and ``accuracy_score(testY, predictions)``.
    """
    fitted = clf.fit(trainX, trainY)
    predictions = fitted.predict(testX)
    return fitted, predictions, accuracy_score(testY, predictions)


In [9]:
def get_method(main_d, name):
    """Resolve a config entry to ``(method, params)``, following ``base_method`` links.

    Parameters
    ----------
    main_d : dict
        Maps a name to a spec holding either ``'method'`` (a callable) or
        ``'base_method'`` (the name of another entry), plus optional ``'params'``.
    name : str
        Entry to resolve.

    Returns
    -------
    tuple
        ``(method, params)``. ``params`` is a fresh dict (base params overlaid with the
        entry's own) or ``None`` when there are none. It never aliases a dict stored in
        ``main_d``, so callers may modify it freely.

    Raises
    ------
    Exception
        If ``name`` is unknown, or the entry has both / neither of ``method`` and ``base_method``.
    """
    if name not in main_d: raise Exception('Cannot find classifier/feature_extractor name in parameter dictionary')
    d = main_d[name]
    method = d.get('method',None)
    base_method = d.get('base_method',None)
    if method and base_method: raise Exception('Cannot have method and base method both.')
    if not method and not base_method: raise Exception('Unable to parse the method from classifier/feature_extractor')
    params = d.get('params',None)
    if method:
        # Copy so that later edits by the caller cannot rewrite the config entry.
        return method, (dict(params) if params else None)
    prev_method, prev_params = get_method(main_d, base_method)
    # Merge into a new dict: writing into the base's params would permanently add this
    # entry's settings (e.g. min_df) to the base and to every other entry derived from it.
    merged = dict(prev_params or {})
    merged.update(params or {})
    return prev_method, (merged or None)

def make_method(main_d, name, override_params=None):
    """Instantiate the method configured under ``name`` in ``main_d``.

    Parameters
    ----------
    main_d : dict
        Config dictionary understood by ``get_method``.
    name : str
        Entry to instantiate.
    override_params : dict or None
        Extra keyword arguments that take precedence over the configured params.

    Returns
    -------
    object
        ``method(**params)`` with the resolved and overridden params.
    """
    method, params = get_method(main_d, name)
    # Work on a private copy: params may be None, and may be a dict owned by the config.
    kwargs = dict(params or {})
    kwargs.update(override_params or {})
    return method(**kwargs)


In [10]:
def _json_safe_clf_entry(entry):
    """Return the JSON-serializable part of a ``clf_result`` entry (fitted objects dropped)."""
    predict = entry.get('predict', [])
    if hasattr(predict, 'tolist'):
        # numpy arrays are not JSON-serializable; convert to a plain list.
        predict = predict.tolist()
    return {
        'data_ratio': entry.get('data_ratio'),
        'feature_extract': entry.get('feature_extract'),
        'classifier': entry.get('classifier'),
        'predict': list(predict),
    }


def run_grid(grid, data, feature_extract, classifiers, clf_result, result, 
             debug=False, override=False, save_folder=None, load_folder=None, file_prefix=''):
    """Train and score every (data ratio, feature extractor, classifier) combination in ``grid``.

    Parameters
    ----------
    grid : iterable of dict
        Each item needs the keys ``'data'``, ``'feature_extract'`` and ``'classifier'``.
    data : dict
        Data-ratio name -> ``{'train': ..., 'test': ...}`` split.
    feature_extract, classifiers : dict
        Config dictionaries resolved through ``make_method``.
    clf_result, result : dict
        Accumulators keyed by ``'<data>-<feature>-<classifier>'``. They are updated in
        place unless replaced by cached files or by ``override``.
    debug : bool
        Print progress messages.
    override : bool
        Start from empty accumulators and skip loading cached files.
    save_folder, load_folder : str or None
        Folders for ``<file_prefix>clf_result.json`` and ``<file_prefix>result.json``.
    file_prefix : str
        Prefix for both file names.

    Returns
    -------
    tuple
        ``(clf_result, result)``. Freshly computed ``clf_result`` entries hold the fitted
        classifier and extractor; entries loaded from disk hold only the predictions.

    Notes
    -----
    A failing combination is reported with ``print`` and skipped, so one bad
    configuration does not abort the whole grid.
    """
    clf_result_name = file_prefix + 'clf_result.json'
    result_name = file_prefix + 'result.json'

    if override:
        clf_result, result = {}, {}
        if debug: print('OVERRIDE')
    elif load_folder:
        clf_result_file = os.path.join(load_folder, clf_result_name)
        result_file = os.path.join(load_folder, result_name)
        found_clf, found_res = os.path.isfile(clf_result_file), os.path.isfile(result_file)
        if found_clf:
            with open(clf_result_file) as fh:
                clf_result = json.load(fh)
        if found_res:
            with open(result_file) as fh:
                result = json.load(fh)
        if found_clf and found_res and debug: print('loaded result')

    for g in grid:
        try:
            data_name = g.get('data',None)
            feature_name = g.get('feature_extract',None)
            clf_name = g.get('classifier', None)
            if not feature_name or not clf_name or not data_name:
                raise Exception('Data Ratio, Feature Extract and Classifier Name required')
            result_key = data_name + '-' + feature_name + '-' + clf_name
            # Already computed (or loaded from cache): nothing to do.
            if result.get(result_key): continue
            if debug: print('Data Ratio:',data_name, '  Feature Name:', \
                            feature_name, '  Clasifier Name:',clf_name, '  Key:',result_key)

            ratio = data.get(data_name, None)
            if ratio is None: raise Exception('Cannot find data ratio')

            feature = make_method(feature_extract, feature_name)
            trainX, testX, trainY, testY, feature2 = make_data(feature, ratio)

            clf = make_method(classifiers, clf_name)
            clf_fit, clf_pred, clf_acc = run_classifier(clf, trainX, testX, trainY, testY)

            result[result_key] = {
                'data_ratio': data_name,
                'feature_extract': feature_name,
                'classifier': clf_name,
                'accuracy': clf_acc
            }

            clf_result[result_key] = {
                'data_ratio': data_name,
                'feature_extract': feature_name,
                'classifier': clf_name,
                'clf': clf_fit,
                'feature': feature2,
                'predict': clf_pred
            }
        except Exception as e:
            print('Error:',e)
            continue

    if save_folder:
        os.makedirs(save_folder, exist_ok=True)
        # Serialize every known entry, cached ones included, so that a run in which
        # everything was already cached does not overwrite the file with an empty dict.
        serializable = {key: _json_safe_clf_entry(entry) for key, entry in clf_result.items()}
        with open(os.path.join(save_folder, clf_result_name), 'w') as fh:
            json.dump(serializable, fh, indent=2)
        with open(os.path.join(save_folder, result_name), 'w') as fh:
            json.dump(result, fh, indent=2)
    return clf_result, result


In [11]:
def parse_result(result, accuracy_threshold=None):
    """Print one markdown table of results per data ratio.

    Parameters
    ----------
    result : dict
        Result key -> dict with ``'data_ratio'``, ``'feature_extract'``,
        ``'classifier'`` and ``'accuracy'``.
    accuracy_threshold : float or None
        When given, only rows with accuracy strictly above it are shown.

    Ratios are printed in increasing order of the number after the colon
    (``'1:1'``, ``'1:2'``, ...), with ``'all'`` last. Nothing is returned.
    """
    if not result:
        # An empty frame has no 'data_ratio' column; report it instead of raising KeyError.
        print('No results to display')
        return
    temp_df = pd.DataFrame(list(result.values()))
    temp_df['keys'] = list(result.keys())

    ratios = set(temp_df['data_ratio'])
    data_vals = sorted(ratios - {'all'}, key=lambda x: int(x.split(':')[1]))
    if 'all' in ratios:
        data_vals.append('all')

    for d in data_vals:
        new_df = temp_df.loc[temp_df['data_ratio']==d]
        # `is not None` so that a threshold of 0 is honored rather than ignored.
        if accuracy_threshold is not None:
            new_df = new_df.loc[new_df['accuracy']>accuracy_threshold]
        new_df = new_df.drop('data_ratio', axis=1).set_index('feature_extract').sort_index()
        print('Data Ratio:',d)
        print(new_df.to_markdown())
        print()

In [32]:
def compare_result(clf_result, data, method_name, conf_matrix=True):
    """Return the test set of one run with its predictions attached.

    Parameters
    ----------
    clf_result : dict
        Result key -> dict with at least ``'data_ratio'`` and ``'predict'``.
    data : dict
        Data-ratio name -> ``{'train': ..., 'test': DataFrame}``.
    method_name : str
        Result key to inspect.
    conf_matrix : bool
        When true, print the confusion matrix of ``is_flood`` against the predictions.

    Returns
    -------
    pandas.DataFrame
        A copy of the test frame with an added ``'predict'`` column. The frame stored
        in ``data`` is left untouched.

    Raises
    ------
    Exception
        If ``method_name`` is not present in ``clf_result``.
    """
    if method_name not in clf_result: raise Exception('Cannot find method')
    res = clf_result[method_name]
    predicted = list(res['predict'])
    # assign() returns a new frame, so the cached test split shared through `data` is not mutated.
    new_df = data[res['data_ratio']]['test'].assign(predict=predicted)
    if conf_matrix:
        # Same truthiness-based 0/1 encoding that make_data uses for the labels.
        actual = [1 if t else 0 for t in new_df['is_flood']]
        print(confusion_matrix(actual, predicted))
    return new_df
    

In [13]:
# Two vectorizer families, each with a plain variant plus min_df=5 and min_df=10
# variants that inherit from it through 'base_method'.
# The inner tuple is rebuilt for every family, so no two entries share a params dict.
feature_extract = {
    key: spec
    for family, label, vectorizer in (
        ('CountVect', 'Count Vectorizer', CountVectorizer),
        ('TFIDF', 'TFIDF', TfidfVectorizer),
    )
    for key, spec in (
        (family, {
            'classifier_type': label,
            'method': vectorizer,
            'params': {'tokenizer': word_tokenize, 'stop_words': 'english'},
        }),
        (family + '-min_df5', {'base_method': family, 'params': {'min_df': 5}}),
        (family + '-min_df10', {'base_method': family, 'params': {'min_df': 10}}),
    )
}

# Classifiers are instantiated with their default arguments (no 'params' given).
classifiers = {
    'DecisionTree': dict(classifier_type='Decision Tree', method=DecisionTreeClassifier),
    'RandomForest': dict(classifier_type='Random Forest ', method=RandomForestClassifier),
    'LinearSVC': dict(classifier_type='Linear SVC', method=LinearSVC),
}

# Full cross product of data splits, feature extractors and classifiers.
grid_parameters = dict(
    data=list(data),
    feature_extract=list(feature_extract),
    classifier=list(classifiers),
)
grid = ParameterGrid(grid_parameters)

In [14]:
# Run the full grid, or reload it from results/ when cached files exist.
override = global_override or False
debug = True
save_results_folder = 'results/'
load_results_folder = 'results/'
if not os.path.isdir(save_results_folder):
    os.mkdir(save_results_folder)
clf_result, result = run_grid(
    grid, data, feature_extract, classifiers, clf_result, result,
    debug=debug, override=override,
    save_folder=save_results_folder, load_folder=load_results_folder,
)


loaded result


In [15]:
# Print every result, one markdown table per data ratio.
parse_result(result)

Data Ratio: 1:1
| feature_extract    | classifier   |   accuracy | keys                                |
|:-------------------|:-------------|-----------:|:------------------------------------|
| CountVect          | DecisionTree |   0.769231 | 1:1-CountVect-DecisionTree          |
| CountVect          | LinearSVC    |   0.730769 | 1:1-CountVect-LinearSVC             |
| CountVect          | RandomForest |   0.961538 | 1:1-CountVect-RandomForest          |
| CountVect-min_df10 | LinearSVC    |   0.730769 | 1:1-CountVect-min_df10-LinearSVC    |
| CountVect-min_df10 | RandomForest |   0.961538 | 1:1-CountVect-min_df10-RandomForest |
| CountVect-min_df10 | DecisionTree |   0.923077 | 1:1-CountVect-min_df10-DecisionTree |
| CountVect-min_df5  | RandomForest |   0.923077 | 1:1-CountVect-min_df5-RandomForest  |
| CountVect-min_df5  | LinearSVC    |   0.730769 | 1:1-CountVect-min_df5-LinearSVC     |
| CountVect-min_df5  | DecisionTree |   0.769231 | 1:1-CountVect-min_df5-DecisionTree  |
| TFI

In [16]:
# Same tables, restricted to runs whose accuracy is above 0.92.
parse_result(result, 0.92)

Data Ratio: 1:1
| feature_extract    | classifier   |   accuracy | keys                                |
|:-------------------|:-------------|-----------:|:------------------------------------|
| CountVect          | RandomForest |   0.961538 | 1:1-CountVect-RandomForest          |
| CountVect-min_df10 | DecisionTree |   0.923077 | 1:1-CountVect-min_df10-DecisionTree |
| CountVect-min_df10 | RandomForest |   0.961538 | 1:1-CountVect-min_df10-RandomForest |
| CountVect-min_df5  | RandomForest |   0.923077 | 1:1-CountVect-min_df5-RandomForest  |
| TFIDF              | RandomForest |   0.923077 | 1:1-TFIDF-RandomForest              |
| TFIDF-min_df10     | RandomForest |   0.923077 | 1:1-TFIDF-min_df10-RandomForest     |
| TFIDF-min_df5      | RandomForest |   0.961538 | 1:1-TFIDF-min_df5-RandomForest      |

Data Ratio: 1:2
| feature_extract   | classifier   |   accuracy | keys                            |
|:------------------|:-------------|-----------:|:--------------------------------

In [35]:
# Print the confusion matrix for one configuration and display its test rows with predictions.
compare_result(clf_result, data, '1:1-TFIDF-min_df5-RandomForest')

[[11  0]
 [ 1 14]]


Unnamed: 0,doc_id,filename,is_flood,is_bangladesh,flood_related,flood_climatechange,newspaper,flood_type,text,predict
0,,a4fskUJKFHYektuFIx5ZETIrjRma-1987_acde84fb87df...,True,True,True,False,ny_times,cyclone,PRIEST IS HONORED FOR WORK IN ASIA\n1987-08-16...,0
1,,araLJi81IPOJ7f9GMEz9YK7sy3uS-1989_79269b1b983f...,False,False,False,False,ny_times,,Key Sections of the Paris Communique by the Gr...,0
2,83244a0f-4ba2-4097-ad62-8f0bf3852cb9,arseC9cfb.zjk0gJlJCv.91O1d4K-dhakaTribune_data...,True,True,True,False,dhaka_tribune,cyclone,Date Published:2020-04-13 00:00:00 \nLoca...,1
3,566e61fe-266f-42ca-8923-338a55b4ee5c,a7E2o724RaTj5PcoX.vKCdTk7IrW-dhakaTribune_data...,False,True,False,False,dhaka_tribune,,Date Published:2020-01-18 00:00:00 \nHund...,0
4,ae67dbcb-80eb-4fbf-a478-5a2ec8bc445c,aeBu7Vd4bXNRkEEGmvQ7aFG4oS0C-theDailyStar_data...,False,True,True,False,daily_star,,Date Published:2019-08-02 00:00:00 \nDeng...,0
5,,a7dCFrpObpJaEgRbcD8kZSrysfLO-1988_1627fbc1b3da...,True,True,True,False,ny_times,,Bangladeshi Asks for Help on Controlling Flood...,1
6,1c428e80-a9d1-4a26-9369-f4930f4858a9,aE92tOgn3iGqpz4z0TMtBhPeuoAi-theDailyStar_data...,True,True,True,False,daily_star,cyclone,Date Published:2019-05-03 00:00:00 \nAt l...,1
7,,au37fJC_PlfQaFbYQ3YOYEZivxYu-1988_4730bf7e1391...,True,True,True,False,ny_times,monsoon,Bangladesh Flood Toll Up to 680 in Monsoon\n19...,1
8,,akxy7trkrzXGd.GjQ6hQhCEUdJXy-1988_0dd4a99cd1a7...,True,True,True,False,ny_times,monsoon,"In Bangladesh, Too Much Water and Not Enough\n...",1
9,c27d553b-857a-4442-b2e6-c65d2723cbdf,akoXaeBgB_DDXJTULUMMZDCjkKYO-dhakaTribune_data...,True,True,True,False,dhaka_tribune,river_erosion,Date Published:2019-10-06 00:00:00 \nIn M...,1


### Classifier Random Forest

In [18]:
# Separate accumulators for the Random-Forest-only grid below.
RF_clf_result, RF_result = {},{}

In [19]:
# Feature extractors for the Random-Forest-only grid: the CountVect family is
# extended with bigram-only / trigram-only variants, alone and combined with min_df.
# Insertion order is significant: it fixes the order of the generated grid.
RF_feature_extract = {
    'CountVect': {
        'classifier_type': 'Count Vectorizer',
        'method': CountVectorizer,
        'params': {'tokenizer': word_tokenize, 'stop_words': 'english'},
    },
    **{
        'CountVect-min_df{}'.format(min_df): {'base_method': 'CountVect', 'params': {'min_df': min_df}}
        for min_df in (5, 10)
    },
    **{
        'CountVect-ngram{}'.format(n): {'base_method': 'CountVect', 'params': {'ngram_range': (n, n)}}
        for n in (2, 3)
    },
    **{
        'CountVect-min_df{}-ngram{}'.format(min_df, n): {
            'base_method': 'CountVect',
            'params': {'ngram_range': (n, n), 'min_df': min_df},
        }
        for min_df in (5, 10)
        for n in (2, 3)
    },
    'TFIDF': {
        'classifier_type': 'TFIDF',
        'method': TfidfVectorizer,
        'params': {'tokenizer': word_tokenize, 'stop_words': 'english'},
    },
    **{
        'TFIDF-min_df{}'.format(min_df): {'base_method': 'TFIDF', 'params': {'min_df': min_df}}
        for min_df in (5, 10)
    },
}

# Only the random forest is evaluated in this section.
RF_classifiers = {
    'RandomForest': dict(classifier_type='Random Forest ', method=RandomForestClassifier),
}

# The unbalanced 'all' split is left out of this grid.
RF_grid_parameters = dict(
    data=['1:1', '1:2', '1:3', '1:5'],
    feature_extract=list(RF_feature_extract),
    classifier=list(RF_classifiers),
)
RF_grid = ParameterGrid(RF_grid_parameters)

In [20]:
# Run the Random-Forest grid, or reload it; its files carry the 'RandomForest-' prefix.
override = global_override or False
debug = True
save_results_folder = 'results/'
load_results_folder = 'results/'
if not os.path.isdir(save_results_folder):
    os.mkdir(save_results_folder)
RF_clf_result, RF_result = run_grid(
    RF_grid, data, RF_feature_extract, RF_classifiers, RF_clf_result, RF_result,
    debug=debug, override=override,
    save_folder=save_results_folder, load_folder=load_results_folder,
    file_prefix='RandomForest-',
)


loaded result


In [21]:
# Print every Random-Forest result, one markdown table per data ratio.
parse_result(RF_result)

Data Ratio: 1:1
| feature_extract           | classifier   |   accuracy | keys                                       |
|:--------------------------|:-------------|-----------:|:-------------------------------------------|
| CountVect                 | RandomForest |   0.961538 | 1:1-CountVect-RandomForest                 |
| CountVect-min_df10        | RandomForest |   0.961538 | 1:1-CountVect-min_df10-RandomForest        |
| CountVect-min_df10-ngram2 | RandomForest |   0.846154 | 1:1-CountVect-min_df10-ngram2-RandomForest |
| CountVect-min_df10-ngram3 | RandomForest |   0.769231 | 1:1-CountVect-min_df10-ngram3-RandomForest |
| CountVect-min_df5         | RandomForest |   0.961538 | 1:1-CountVect-min_df5-RandomForest         |
| CountVect-min_df5-ngram2  | RandomForest |   0.807692 | 1:1-CountVect-min_df5-ngram2-RandomForest  |
| CountVect-min_df5-ngram3  | RandomForest |   0.615385 | 1:1-CountVect-min_df5-ngram3-RandomForest  |
| CountVect-ngram2          | RandomForest |   0.846154 |

In [22]:
# Same tables, restricted to runs whose accuracy is above 0.92.
parse_result(RF_result, 0.92)

Data Ratio: 1:1
| feature_extract    | classifier   |   accuracy | keys                                |
|:-------------------|:-------------|-----------:|:------------------------------------|
| CountVect          | RandomForest |   0.961538 | 1:1-CountVect-RandomForest          |
| CountVect-min_df10 | RandomForest |   0.961538 | 1:1-CountVect-min_df10-RandomForest |
| CountVect-min_df5  | RandomForest |   0.961538 | 1:1-CountVect-min_df5-RandomForest  |
| TFIDF              | RandomForest |   0.961538 | 1:1-TFIDF-RandomForest              |
| TFIDF-min_df10     | RandomForest |   0.961538 | 1:1-TFIDF-min_df10-RandomForest     |
| TFIDF-min_df5      | RandomForest |   0.961538 | 1:1-TFIDF-min_df5-RandomForest      |

Data Ratio: 1:2
| feature_extract   | classifier   |   accuracy | keys                            |
|:------------------|:-------------|-----------:|:--------------------------------|
| TFIDF             | RandomForest |   0.921053 | 1:2-TFIDF-RandomForest          |
| T