In [1]:
# @TODO: try this for y-values
# sometimes this is called “non-parametric” in classical statistics
# rank-based /percentile cutoff

In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
                            accuracy_score, 
                            precision_score, 
                            recall_score, 
                            f1_score, 
                            roc_curve, 
                            roc_auc_score, 
                            confusion_matrix
                            )
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import confusion_matrix

from unbalanced_dataset import OverSampler, SMOTE

import matplotlib.pyplot as plt
# import seaborn as sns

import statsmodels.api as sm
from statsmodels.tsa.ar_model import AR, ARResults
from scipy.misc import factorial as fact

%matplotlib inline

In [2]:
CONFIG = {}

with open('config.txt', "r") as in_file:
    for line in in_file:
        line = line.split(":")
        parameter = line[0].strip()
        value = line[1].strip()
        CONFIG[parameter] = value
                
engine_path = 'postgresql://'+CONFIG['username']+':'+\
                CONFIG['password']+'@'+CONFIG['ip']+\
                ':'+CONFIG['port']+'/'+CONFIG['database']
        
engine = create_engine(engine_path)

##Want something like:

In [3]:
def makeSQLQuery(table_names, year, seriesnames):
    dotyyear = '.y' + str(year)
    short = table_names[0][0:2]
    seriesnames = seriesnames
    abb = [ser[0:2] for ser in seriesnames]
    query = 'SELECT x.*, y' + dotyyear + ' as zscores FROM (' + \
            'SELECT ' + short + '.countrycode AS countrycode, ' + \
            short + dotyyear + ' AS ' + table_names[0] 
    
    mystr = ''
    for s in seriesnames:
        mystr = mystr + ', env.' + s
    mystr = mystr
    
    query = query + mystr
    
    query = query + ' FROM (SELECT countrycode, ' + dotyyear[1:] + \
        ' FROM ' + table_names[0] + ') ' + short + ' JOIN ' + \
        '(SELECT ' + abb[0] + '.countrycode AS countrycode'
    
    mystr = ''
    for a, ser in zip(abb, seriesnames):
        mystr = mystr + ', ' + a + dotyyear + ' as ' + ser
    query = query + mystr
    
    query = query + ' FROM ('
    
    subquery = []
    for a, ser in zip(abb, seriesnames):
        mystr = ''
        mystr = mystr + '(SELECT countrycode, ' + dotyyear[1:] + \
        ' FROM ' + table_names[1] + " WHERE series='" + ser + \
        "') " + a
        subquery.append(mystr)
    
    query = query + subquery[0] + ' JOIN ' + subquery[1] + ' ON ' + \
        abb[0] + '.countrycode = ' + abb[1] + '.countrycode' + \
        ' JOIN ' + subquery[2] + ' ON ' + \
        abb[0] + '.countrycode = ' + abb[2] + '.countrycode' + \
        ' JOIN ' + subquery[3] + ' ON ' + \
        abb[0] + '.countrycode = ' + abb[3] + '.countrycode)) env' + \
        ' ON ' + short + '.countrycode = env.countrycode'
        
    query = query + ') x JOIN (select countrycode, ' + dotyyear[1:] + \
            ' FROM zscores) y on x.countrycode = y.countrycode'
    
    return query

In [4]:
years = [x for x in range(2006, 2014)]

In [5]:
tables = ['forestarea', 'environment']
features = ['precipitation', 'extreme', 'arable_land', 'freshwater']
print makeSQLQuery(tables, years[0], features)

SELECT x.*, y.y2006 as zscores FROM (SELECT fo.countrycode AS countrycode, fo.y2006 AS forestarea, env.precipitation, env.extreme, env.arable_land, env.freshwater FROM (SELECT countrycode, y2006 FROM forestarea) fo JOIN (SELECT pr.countrycode AS countrycode, pr.y2006 as precipitation, ex.y2006 as extreme, ar.y2006 as arable_land, fr.y2006 as freshwater FROM ((SELECT countrycode, y2006 FROM environment WHERE series='precipitation') pr JOIN (SELECT countrycode, y2006 FROM environment WHERE series='extreme') ex ON pr.countrycode = ex.countrycode JOIN (SELECT countrycode, y2006 FROM environment WHERE series='arable_land') ar ON pr.countrycode = ar.countrycode JOIN (SELECT countrycode, y2006 FROM environment WHERE series='freshwater') fr ON pr.countrycode = fr.countrycode)) env ON fo.countrycode = env.countrycode) x JOIN (select countrycode, y2006 FROM zscores) y on x.countrycode = y.countrycode


In [13]:
df = pd.DataFrame()

In [18]:
for year in years:
    query = makeSQLQuery(tables, year, features)
    year_data = pd.read_sql_query(query, engine)
    year_data['year'] = year
    year_data = year_data.set_index(['countrycode', 'year'])
    df = pd.concat([df, year_data])

In [21]:
df.columns

Index([u'forestarea', u'precipitation', u'extreme', u'arable_land',
       u'freshwater', u'zscores'],
      dtype='object')

In [16]:
df.size

7506

In [24]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,forestarea,precipitation,extreme,arable_land,freshwater,zscores
countrycode,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ARG,2006,10.913914,591,0.16719,12.735823,7305.496838,-0.278010269
AUS,2006,16.499616,534,3.047099,6.211031,23622.500912,-0.279337781
BRA,2006,60.429569,1761,0.482451,8.319794,29364.390723,-0.27919335
CHN,2006,20.723562,645,7.950971,11.57835,2134.480626,-0.272709544
FRA,2006,29.172541,867,0.005718,33.490456,3124.207769,-0.279344809


In [27]:
df['zscores'] = df['zscores'].astype(float)

In [29]:
df = df.dropna()

In [30]:
df.describe()

Unnamed: 0,forestarea,precipitation,extreme,arable_land,freshwater,zscores
count,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0
mean,30.606953,1112.219025,1.21443,16.10975,14878.54233,0.040514
std,22.133838,757.13496,1.976777,13.874216,35020.530613,1.143605
min,0.067909,51.0,0.0,0.051769,0.0,-0.293136
25%,11.078235,562.0,0.023082,5.127726,1147.784317,-0.276353
50%,30.513576,1010.0,0.269239,11.936709,3063.4586,-0.262625
75%,46.556322,1604.0,1.295973,22.676412,12391.666391,-0.197219
max,98.50641,3240.0,9.226586,60.536222,323213.140961,9.868634


In [36]:
df['binary'] = df['zscores'].apply(lambda x: 0 if x<0 else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [39]:
len(df[df['binary']==1])

394

In [40]:
len(df)

2502

In [41]:
394./2502*100

15.74740207833733

In [42]:
df.describe()

Unnamed: 0,forestarea,precipitation,extreme,arable_land,freshwater,zscores,binary
count,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0
mean,30.606953,1112.219025,1.21443,16.10975,14878.54233,0.040514,0.157474
std,22.133838,757.13496,1.976777,13.874216,35020.530613,1.143605,0.36432
min,0.067909,51.0,0.0,0.051769,0.0,-0.293136,0.0
25%,11.078235,562.0,0.023082,5.127726,1147.784317,-0.276353,0.0
50%,30.513576,1010.0,0.269239,11.936709,3063.4586,-0.262625,0.0
75%,46.556322,1604.0,1.295973,22.676412,12391.666391,-0.197219,0.0
max,98.50641,3240.0,9.226586,60.536222,323213.140961,9.868634,1.0


In [43]:
df = df.swaplevel(0, 1, axis=0)

In [150]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,forestarea,precipitation,extreme,arable_land,freshwater,zscores
year,countrycode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006,ARG,10.913914,591,0.16719,12.735823,7305.496838,0
2006,AUS,16.499616,534,3.047099,6.211031,23622.500912,0
2006,BRA,60.429569,1761,0.482451,8.319794,29364.390723,0
2006,CHN,20.723562,645,7.950971,11.57835,2134.480626,0
2006,FRA,29.172541,867,0.005718,33.490456,3124.207769,0


In [44]:
x_cols = ['forestarea', 'precipitation', 'extreme', 'arable_land', 'freshwater']

In [45]:
# convert each feature to z-score to normalize
for col in x_cols:
    df[col] = (df[col] - df[col].mean())/df[col].std(ddof=0)

In [65]:
def getScoreValues(X_train, 
                   X_test, 
                   y_train, 
                   y_test, 
                   model=KNeighborsClassifier(n_neighbors=6), 
                   verbose=True, 
                   get_features=True
                  ):
    model = model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    
    
    accuracy_score_value = accuracy_score(y_test, y_pred)
    precision_score_value = precision_score(y_test, y_pred)
    recall_score_value = recall_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred) 
    
    scores = pd.Series(data=[accuracy_score_value, precision_score_value, recall_score_value, f1_score_value],
                      index=["Accuracy", "Precision", "Recall", "F1"])
    
#     feature_importances = None
#     if get_features:
#         try:
#             feature_importances = model.feature_importances_
#             #print(feature_importances)
#         except AttributeError:
#             pass
        
#     fimp = pd.Series(data=feature_importances, index=x_cols)
    
#     if verbose:
#         #print(pd.concat([y_test,pd.Series(y_pred, index=y_test.index)], axis = 1))
#         print('Accuracy: {}\nPrecision: {}\nRecall: {}\nf1: {}'.format(accuracy_score_value, \
#                                                                        precision_score_value, \
#                                                                        recall_score_value, \
#                                                                        f1_score_value))
    return scores

In [49]:
X = df[x_cols]
y = df['binary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=4444)

In [54]:
getScoreValues(X_train, X_test, y_train, y_test, DecisionTreeClassifier())

Accuracy: 0.98535286285
Precision: 0.95041322314
Recall: 0.958333333333
f1: 0.954356846473


(Accuracy     0.985353
 Precision    0.950413
 Recall       0.958333
 F1           0.954357
 dtype: float64, forestarea       0.257187
 precipitation    0.333671
 extreme          0.061081
 arable_land      0.116111
 freshwater       0.231950
 dtype: float64)

In [70]:
# countries = pd.Series([x for x, y in df.index])
# arg_forestarea = df.loc['ARG']['forestarea']
# arg_forestarea
# int(fact(3))

# print "n, year, div_by, adj_forest, actual_forest"
# for i, val in enumerate(arg_forestarea):
#    index = arg_forestarea.index[i]
#    n = i + 1
#    tot = np.sum(np.array([x for x in range(1, n+1)]))
#    
#    vals = [arg_forestarea.loc[yr] for yr in range(min(arg_forestarea.index), index+1)]
#    weighted = [(i+1)*v for i, v in enumerate(vals)]
#    weighted
#    print n, index, tot, np.sum(np.array(weighted))/tot, val

test = np.arange(min(years)+1, max(years))
train = []
for y in test:
    tr = [yr for yr in years if yr<y]
    train.append(tr)
    print tr, y
    
training_sets = []
test_sets = []


[2006] 2007
[2006, 2007] 2008
[2006, 2007, 2008] 2009
[2006, 2007, 2008, 2009] 2010
[2006, 2007, 2008, 2009, 2010] 2011
[2006, 2007, 2008, 2009, 2010, 2011] 2012


In [71]:
for prev, year in zip(train, test):
    test_sets.append(df.xs(year, level='year', axis=0))
    
    for year in test:
        trainyear = pd.DataFrame()
        for y in prev:
            data = df.xs(y, level='year', axis=0)
            trainyear = pd.concat([trainyear, data])
    
    training_sets.append(trainyear)

In [63]:
def score_model(train, test, model):
    year = 2007
    dfscores = pd.DataFrame()
    for train, test in zip(train, test):
        X_train, X_test, y_train, y_test = train[x_cols], test[x_cols], train['binary'], test['binary']
        scores = getScoreValues(X_train, X_test, y_train, y_test, DecisionTreeClassifier())
        dfscores = dfscores.append(scores, ignore_index=True)
        year += 1
    avgscores = [(s, dfscores[s].mean()) for s in dfscores]
    totscores = pd.Series([dfscores[s].mean() for s in dfscores], index=[s for s in dfscores])
#     print avgscores
    return totscores

In [92]:
score_model(training_sets, test_sets, DecisionTreeClassifier())

Accuracy     0.957313
F1           0.867704
Precision    0.872809
Recall       0.866848
dtype: float64

In [67]:
smallxcols = ['forestarea', 'precipitation', 'freshwater']

In [58]:
models = [DecisionTreeClassifier(), BernoulliNB(), GaussianNB(), 
          SVC(), RandomForestClassifier(), KNeighborsClassifier()]

In [59]:
modelnames = ["Decision Tree", "Bernoulli NB", "Gaussian NB", "SVM", "Random Forest", "K Neighbors"]

In [98]:
training_sets[0]

Unnamed: 0_level_0,forestarea,precipitation,extreme,arable_land,freshwater,zscores,binary
countrycode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ARG,-0.889903,-0.688547,-0.529877,-0.243228,-0.216289,-0.278010,0
AUS,-0.637493,-0.763846,0.927284,-0.713604,0.249731,-0.279338,0
BRA,1.347646,0.857061,-0.370363,-0.561582,0.413721,-0.279193,0
CHN,-0.446618,-0.617212,3.408522,-0.326671,-0.363975,-0.272710,0
FRA,-0.064819,-0.323942,-0.611578,1.252985,-0.335708,-0.279345,0
DEU,0.092749,-0.544555,-0.597863,1.291686,-0.387790,-0.279339,0
IND,-0.347672,-0.038599,1.589611,2.685696,-0.389929,-0.278466,0
IDN,1.040819,2.100152,-0.533217,-0.305779,-0.176705,-0.269940,0
ITA,-0.029179,-0.370178,-0.610878,0.646417,-0.335744,-0.279312,0
JPN,1.708985,0.734205,-0.603870,-0.302405,-0.328993,-0.279343,0


In [96]:
dfmodelscores = pd.DataFrame(index=["Accuracy", "Precision", "Recall", "F1"], columns=modelnames)
for model, name in zip(models, modelnames):
    print model
    print score_model(training_sets, test_sets, model)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')
Accuracy     0.958381
F1           0.870277
Precision    0.878158
Recall       0.866848
dtype: float64
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
Accuracy     0.957313
F1           0.865147
Precision    0.881351
Recall       0.853191
dtype: float64
GaussianNB()
Accuracy     0.958374
F1           0.869219
Precision    0.883016
Recall       0.860438
dtype: float64
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
Accuracy     0.957319
F1           0.868609
Precision    0.870183
Recall       0.872422
dtype: float64
RandomForestClassifier(bootstrap=True, class_weight=None

In [93]:
def score_model(train, test, model):
    year = 2007
    dfscores = pd.DataFrame()
    for train, test in zip(train, test):
        X_train, X_test, y_train, y_test = train[smallxcols], test[smallxcols], train['binary'], test['binary']
        scores = getScoreValues(X_train, X_test, y_train, y_test, DecisionTreeClassifier())
        dfscores = dfscores.append(scores, ignore_index=True)
        year += 1
    avgscores = [(s, dfscores[s].mean()) for s in dfscores]
    totscores = pd.Series([dfscores[s].mean() for s in dfscores], index=[s for s in dfscores])
#     print avgscores
    return totscores

In [76]:
models

[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             random_state=None, splitter='best'),
 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
 GaussianNB(),
 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
   kernel='rbf', max_iter=-1, probability=False, random_state=None,
   shrinking=True, tol=0.001, verbose=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metri

In [82]:
type(score_model(training_sets, test_sets, DecisionTreeClassifier()))

pandas.core.series.Series

In [97]:
dfmodelscoresSmall = pd.DataFrame(index=["Accuracy", "Precision", "Recall", "F1"], columns=modelnames)
for model, name in zip(models, modelnames):
    print name
    print score_model(training_sets, test_sets, model)

Decision Tree
Accuracy     0.961579
F1           0.879918
Precision    0.886166
Recall       0.879668
dtype: float64
Bernoulli NB
Accuracy     0.957319
F1           0.867667
Precision    0.872194
Recall       0.866848
dtype: float64
Gaussian NB
Accuracy     0.956251
F1           0.863618
Precision    0.871166
Recall       0.860438
dtype: float64
SVM
Accuracy     0.958381
F1           0.871334
Precision    0.874817
Recall       0.873258
dtype: float64
Random Forest
Accuracy     0.959456
F1           0.873727
Precision    0.877941
Recall       0.873258
dtype: float64
K Neighbors
Accuracy     0.958381
F1           0.871447
Precision    0.875378
Recall       0.873258
dtype: float64


In [236]:
def getROCcurve(X_train, X_test, y_train, y_test, model):
    model = model
    model.fit(X_train, y_train)
    y_scores = model.predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, y_scores)

    auc = roc_auc_score(y_test, y_scores)
#     print('AUC: {}'.format(auc))
    
    fig,ax = plt.subplots()
    ax.plot(fpr, tpr, label='AUC: %0.3f' % auc)

    fig.set_size_inches(12, 8, forward=True)
    plt.xlabel('False Postive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.title('Random Forest ROC Curve for Climate-Triggered Refugee Disasters', fontsize=16)
    plt.legend(loc='best', fontsize=14)

In [257]:
test2013 = df.xs(2013, level='year', axis=0)

In [261]:
train2013 = pd.concat([training_sets[5], df.xs(2012, level='year', axis=0)])

In [262]:
X_train, X_test, y_train, y_test = train2013[x_cols], test2013[x_cols], train2013['zscores'], test2013['zscores']