In [1]:
# @TODO: try this for y-values
# sometimes this is called “non-parametric” in classical statistics
# rank-based /percentile cutoff

In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
                            accuracy_score, 
                            precision_score, 
                            recall_score, 
                            f1_score, 
                            roc_curve, 
                            roc_auc_score, 
                            confusion_matrix
                            )
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import confusion_matrix

from unbalanced_dataset import OverSampler, SMOTE

import matplotlib.pyplot as plt
# import seaborn as sns

import statsmodels.api as sm
from statsmodels.tsa.ar_model import AR, ARResults
from scipy.misc import factorial as fact

%matplotlib inline

In [2]:
CONFIG = {}

with open('config.txt', "r") as in_file:
    for line in in_file:
        line = line.split(":")
        parameter = line[0].strip()
        value = line[1].strip()
        CONFIG[parameter] = value
                
engine_path = 'postgresql://'+CONFIG['username']+':'+\
                CONFIG['password']+'@'+CONFIG['ip']+\
                ':'+CONFIG['port']+'/'+CONFIG['database']
        
engine = create_engine(engine_path)

##Want something like:

In [3]:
def makeSQLQuery(table_names, year, seriesnames):
    dotyyear = '.y' + str(year)
    short = table_names[0][0:2]
    seriesnames = seriesnames
    abb = [ser[0:2] for ser in seriesnames]
    query = 'SELECT x.*, value as refugees FROM (' + \
            'SELECT ' + short + '.countrycode AS countrycode, ' + \
            short + dotyyear + ' AS ' + table_names[0] 
    
    mystr = ''
    for s in seriesnames:
        mystr = mystr + ', env.' + s
    mystr = mystr
    
    query = query + mystr
    
    query = query + ' FROM (SELECT countrycode, ' + dotyyear[1:] + \
        ' FROM ' + table_names[0] + ') ' + short + ' JOIN ' + \
        '(SELECT ' + abb[0] + '.countrycode AS countrycode'
    
    mystr = ''
    for a, ser in zip(abb, seriesnames):
        mystr = mystr + ', ' + a + dotyyear + ' as ' + ser
    query = query + mystr
    
    query = query + ' FROM ('
    
    subquery = []
    for a, ser in zip(abb, seriesnames):
        mystr = ''
        mystr = mystr + '(SELECT countrycode, ' + dotyyear[1:] + \
        ' FROM ' + table_names[1] + " WHERE series='" + ser + \
        "') " + a
        subquery.append(mystr)
    
    query = query + subquery[0] + ' JOIN ' + subquery[1] + ' ON ' + \
        abb[0] + '.countrycode = ' + abb[1] + '.countrycode' + \
        ' JOIN ' + subquery[2] + ' ON ' + \
        abb[0] + '.countrycode = ' + abb[2] + '.countrycode' + \
        ' JOIN ' + subquery[3] + ' ON ' + \
        abb[0] + '.countrycode = ' + abb[3] + '.countrycode)) env' + \
        ' ON ' + short + '.countrycode = env.countrycode'
        
    query = query + ') x JOIN (select countrycode, value' + \
            ' FROM refugees WHERE year = ' + dotyyear[2:] + \
        ') y on x.countrycode = y.countrycode'
    
    return query

In [4]:
years = [x for x in range(2006, 2014)]

In [5]:
tables = ['forestarea', 'environment']
features = ['precipitation', 'extreme', 'arable_land', 'freshwater']
print makeSQLQuery(tables, years[0], features)

SELECT x.*, value as refugees FROM (SELECT fo.countrycode AS countrycode, fo.y2006 AS forestarea, env.precipitation, env.extreme, env.arable_land, env.freshwater FROM (SELECT countrycode, y2006 FROM forestarea) fo JOIN (SELECT pr.countrycode AS countrycode, pr.y2006 as precipitation, ex.y2006 as extreme, ar.y2006 as arable_land, fr.y2006 as freshwater FROM ((SELECT countrycode, y2006 FROM environment WHERE series='precipitation') pr JOIN (SELECT countrycode, y2006 FROM environment WHERE series='extreme') ex ON pr.countrycode = ex.countrycode JOIN (SELECT countrycode, y2006 FROM environment WHERE series='arable_land') ar ON pr.countrycode = ar.countrycode JOIN (SELECT countrycode, y2006 FROM environment WHERE series='freshwater') fr ON pr.countrycode = fr.countrycode)) env ON fo.countrycode = env.countrycode) x JOIN (select countrycode, value FROM refugees WHERE year = 2006) y on x.countrycode = y.countrycode


In [6]:
df = pd.DataFrame()

In [7]:
for year in years:
    query = makeSQLQuery(tables, year, features)
    year_data = pd.read_sql_query(query, engine)
    year_data['year'] = year
    year_data = year_data.set_index(['countrycode', 'year'])
    df = pd.concat([df, year_data])

In [8]:
df = df.dropna()

In [9]:
df['refugees'].quantile(q=0.9)

109686.00000000001

In [12]:
for y in years:
    print y, df.xs(y, level='year', axis=0)['refugees'].quantile(q=0.9)

2006 124247.6
2007 97128.0
2008 104584.2
2009 106411.8
2010 106371.2
2011 109194.6
2012 108614.4
2013 76703.0


In [83]:
df['binary'] = df['refugees'].apply(lambda x: 0 if x < df['refugees'].quantile(q=0.95) else 1)

In [13]:
df.describe()

Unnamed: 0,forestarea,precipitation,extreme,arable_land,freshwater,refugees
count,1224.0,1224.0,1224.0,1224.0,1224.0,1224.0
mean,30.56391,1113.254902,1.234982,15.922536,15008.162324,62441.800654
std,22.088667,766.613571,1.993244,13.790668,35396.126119,278828.799198
min,0.067909,51.0,0.0,0.051769,0.0,0.0
25%,11.092823,537.0,0.023082,5.151829,1141.54191,198.5
50%,30.608826,917.0,0.304579,11.917845,2944.990323,1473.0
75%,46.397661,1646.0,1.297429,22.543011,12316.252997,14637.0
max,98.50641,3240.0,9.226586,60.536222,323213.140961,3057661.0


In [16]:
df = df.swaplevel(0, 1, axis=0)

In [17]:
x_cols = ['forestarea', 'precipitation', 'extreme', 'arable_land', 'freshwater']

In [152]:
# convert each feature to z-score to normalize
for col in x_cols:
    df[col] = (df[col] - df[col].mean())/df[col].std(ddof=0)

In [69]:
def getScoreValues(X_train, 
                   X_test, 
                   y_train, 
                   y_test, 
                   model=KNeighborsClassifier(n_neighbors=6), 
                   verbose=True, 
                   get_features=True
                  ):
    model = model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy_score_value = accuracy_score(y_test, y_pred)
    precision_score_value = precision_score(y_test, y_pred)
    recall_score_value = recall_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred) 
    
    scores = pd.Series(data=[accuracy_score_value, precision_score_value, recall_score_value, f1_score_value],
                      index=["Accuracy", "Precision", "Recall", "F1"])
    
    feature_importances = None
    if get_features:
        try:
            feature_importances = model.feature_importances_
            #print(feature_importances)
        except AttributeError:
            pass
        
    fimp = pd.Series(data=feature_importances, index=x_cols)
    
    if verbose:
        #print(pd.concat([y_test,pd.Series(y_pred, index=y_test.index)], axis = 1))
        print('Accuracy: {}\nPrecision: {}\nRecall: {}\nf1: {}'.format(accuracy_score_value, \
                                                                       precision_score_value, \
                                                                       recall_score_value, \
                                                                       f1_score_value))
    return scores, fimp

In [14]:
X = df[x_cols]
y = df['binary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=4444)

NameError: name 'x_cols' is not defined

In [70]:
getScoreValues(X_train, X_test, y_train, y_test, DecisionTreeClassifier())

NameError: name 'X_train' is not defined

In [66]:
# countries = pd.Series([x for x, y in df.index])
# arg_forestarea = df.loc['ARG']['forestarea']
# arg_forestarea
# int(fact(3))

# print "n, year, div_by, adj_forest, actual_forest"
# for i, val in enumerate(arg_forestarea):
#    index = arg_forestarea.index[i]
#    n = i + 1
#    tot = np.sum(np.array([x for x in range(1, n+1)]))
#    
#    vals = [arg_forestarea.loc[yr] for yr in range(min(arg_forestarea.index), index+1)]
#    weighted = [(i+1)*v for i, v in enumerate(vals)]
#    weighted
#    print n, index, tot, np.sum(np.array(weighted))/tot, val

test = np.arange(min(years)+1, max(years))
train = []
for y in test:
    tr = [yr for yr in years if yr<y]
    train.append(tr)
    print tr, y
    
training_sets = []
test_sets = []


[2006] 2007
[2006, 2007] 2008
[2006, 2007, 2008] 2009
[2006, 2007, 2008, 2009] 2010
[2006, 2007, 2008, 2009, 2010] 2011
[2006, 2007, 2008, 2009, 2010, 2011] 2012


In [67]:
for prev, year in zip(train, test):
    test_sets.append(df.xs(year, level='year', axis=0))
    
    for year in test:
        trainyear = pd.DataFrame()
        for y in prev:
            data = df.xs(y, level='year', axis=0)
            trainyear = pd.concat([trainyear, data])
    
    training_sets.append(trainyear)

In [71]:
def score_model(train, test, model):
    year = 2007
    dfscores = pd.DataFrame()
    for train, test in zip(train, test):
        X_train, X_test, y_train, y_test = train[x_cols], test[x_cols], train['binary'], test['binary']
        scores = getScoreValues(X_train, X_test, y_train, y_test, DecisionTreeClassifier())
        dfscores = dfscores.append(scores, ignore_index=True)
        year += 1
    avgscores = [(s, dfscores[s].mean()) for s in dfscores]
    totscores = pd.Series([dfscores[s].mean() for s in dfscores], index=[s for s in dfscores])
#     print avgscores
    return totscores

In [72]:
getScoreValues(X_train, X_test, y_train, y_test, DecisionTreeClassifier())

NameError: name 'X_train' is not defined

In [22]:
models = [DecisionTreeClassifier(), BernoulliNB(), GaussianNB(), 
          SVC(), RandomForestClassifier(), KNeighborsClassifier()]

In [23]:
modelnames = ["Decision Tree", "Bernoulli NB", "Gaussian NB", "SVM", "Random Forest", "K Neighbors"]

In [77]:
trainblah = training_sets[5]

In [78]:
testblah = test_sets[5]

In [81]:
X_train, X_test, y_train, y_test = trainblah[x_cols], testblah[x_cols], trainblah['binary'], testblah['binary']

In [82]:
getScoreValues(X_train, X_test, y_train, y_test, DecisionTreeClassifier())

Accuracy: 0.993464052288
Precision: 0.5
Recall: 1.0
f1: 0.666666666667


(Accuracy     0.993464
 Precision    0.500000
 Recall       1.000000
 F1           0.666667
 dtype: float64, forestarea       0.831498
 precipitation    0.000000
 extreme          0.000000
 arable_land      0.168502
 freshwater       0.000000
 dtype: float64)

In [2]:
dfmodelscores = pd.DataFrame(index=["Accuracy", "Precision", "Recall", "F1"], columns=modelnames)
for model, name in zip(models, modelnames):
    print name
    print score_model(training_sets, test_sets, model)
    dfmodelscores[name] = score_model(training_sets, test_sets, model)
dfmodelscores

NameError: name 'modelnames' is not defined

In [236]:
def getROCcurve(X_train, X_test, y_train, y_test, model):
    model = model
    model.fit(X_train, y_train)
    y_scores = model.predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, y_scores)

    auc = roc_auc_score(y_test, y_scores)
#     print('AUC: {}'.format(auc))
    
    fig,ax = plt.subplots()
    ax.plot(fpr, tpr, label='AUC: %0.3f' % auc)

    fig.set_size_inches(12, 8, forward=True)
    plt.xlabel('False Postive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.title('Random Forest ROC Curve for Climate-Triggered Refugee Disasters', fontsize=16)
    plt.legend(loc='best', fontsize=14)

In [257]:
test2013 = df.xs(2013, level='year', axis=0)

In [261]:
train2013 = pd.concat([training_sets[5], df.xs(2012, level='year', axis=0)])

In [262]:
X_train, X_test, y_train, y_test = train2013[x_cols], test2013[x_cols], train2013['zscores'], test2013['zscores']

In [265]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,forestarea,precipitation,extreme,arable_land,freshwater,zscores
year,countrycode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006,ARG,-0.889903,-0.688547,-0.529877,-0.243228,-0.216289,0
2006,AUS,-0.637493,-0.763846,0.927284,-0.713604,0.249731,0
2006,BRA,1.347646,0.857061,-0.370363,-0.561582,0.413721,0
2006,CHN,-0.446618,-0.617212,3.408522,-0.326671,-0.363975,0
2006,FRA,-0.064819,-0.323942,-0.611578,1.252985,-0.335708,0
2006,DEU,0.092749,-0.544555,-0.597863,1.291686,-0.387790,0
2006,IND,-0.347672,-0.038599,1.589611,2.685696,-0.389929,0
2006,IDN,1.040819,2.100152,-0.533217,-0.305779,-0.176705,0
2006,ITA,-0.029179,-0.370178,-0.610878,0.646417,-0.335744,0
2006,JPN,1.708985,0.734205,-0.603870,-0.302405,-0.328993,0


In [263]:
getROCcurve(X_train, X_test, y_train, y_test, SVC())

AttributeError: predict_proba is not available when probability=False

In [227]:
for s in dfscores:
    print dfscores[s].mean()

0.964784419402
0.889441587824
0.900448717949
0.880202526942


In [199]:
len(training_sets[0])

157

In [200]:
print len(training_sets[0][x_cols])
print len(training_sets[0]['zscores'])
print 

157
157


In [180]:
for ts in test_sets:
    print ts.size

942
936
936
936
936
936


In [181]:
for ts in training_sets:
    print ts.size

942
1884
2820
3756
4692
5628


In [166]:
df.xs(2006, level='year', axis=0).size

942

In [182]:
df.xs(2008, level='year', axis=0).size

936

In [170]:
train2008 = pd.DataFrame()
for y in train[1]:
    data = df.xs(y, level='year', axis=0)
    train2008 = pd.concat([train2008, data])
print train2008.size

1884
