In [40]:
# data organization libraries
import numpy as np
import pandas as pd
# viz
import matplotlib.pyplot as plt
import seaborn as sns
# modeling libraries
import sklearn as sk
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [41]:
# Load the prepared data set from 'raw_df.csv' in the working directory.
raw = pd.read_csv('raw_df.csv')

# Preview the first rows to check the columns.
raw.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,area_cat,temp_bins
0,1,2,8,5,91.0,166.9,752.6,7.1,25.9,41,3.6,0.0,0.0,0-5,>25
1,5,4,8,2,95.1,141.3,605.8,17.7,26.4,34,3.6,0.0,16.4,10-50,>25
2,6,5,8,3,92.1,111.2,654.1,9.6,16.6,47,0.9,0.0,2.29,0-5,15-20
3,6,3,8,4,91.6,138.1,621.7,6.3,18.9,41,3.1,0.0,10.34,10-50,15-20
4,4,5,8,0,90.2,99.6,631.2,6.3,21.4,33,3.1,0.0,0.0,0-5,20-25


In [42]:
# Number of rows in the raw data.
len(raw)

2652

In [43]:
# Count how many rows fall into each temperature bin.
temp_bins = raw['temp_bins'].value_counts()

# Uncomment to display the counts.
#temp_bins

In [44]:
# Rows whose temperature bin is '20-25'.
temp_twen = raw[raw['temp_bins'] == '20-25']

# Uncomment to preview the filtered rows.
#temp_twen.head()

In [45]:
# Rows whose area category is '10-50'.
area_fifty = raw[raw['area_cat'] == '10-50']

# Preview the filtered rows.
area_fifty.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,area_cat,temp_bins
1,5,4,8,2,95.1,141.3,605.8,17.7,26.4,34,3.6,0.0,16.4,10-50,>25
3,6,3,8,4,91.6,138.1,621.7,6.3,18.9,41,3.1,0.0,10.34,10-50,15-20
8,2,5,8,5,93.9,135.7,586.7,15.1,23.5,36,5.4,0.0,10.02,10-50,20-25
16,4,3,8,3,94.5,139.4,689.1,20.0,28.9,29,4.9,0.0,49.59,10-50,>25
26,7,4,8,0,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16,10-50,20-25


In [46]:
# names = ['temp_bins', 'area_cat'], keys = ['>25', '0-5']
def filter_df(df, names, keys):
    """
    Keep the rows of df whose columns equal the requested values.

    df - DataFrame to filter
    names - list of column names, e.g. ['temp_bins', 'area_cat']
    keys - list of required values, paired positionally with names,
           e.g. ['>25', '0-5']

    Every (name, key) pair is applied, so any number of conditions can be
    combined. Entries of the longer list that have no partner are ignored.
    With no pairs at all, a copy of df holding every row is returned.

    return the rows of df that satisfy all conditions
    """
    keep = None

    for name, key in zip(names, keys):

        matches = df[name] == key

        # AND each new condition onto the ones collected so far
        keep = matches if keep is None else keep & matches

    if keep is None:

        # nothing to filter on
        return df.copy()

    return df[keep]


In [47]:
# Same '20-25' temperature-bin filter as temp_twen, this time through filter_df().
# NOTE(review): despite the variable name, no area condition is applied here.
high_temp_low_area = filter_df(raw, ['temp_bins'], ['20-25'])

# Uncomment to preview the filtered rows.
#high_temp_low_area.head()

In [48]:
# Number of rows in the '20-25' temperature bin.
len(high_temp_low_area)

840

In [49]:
# bias_dics is a list of bias dictionaries, e.g. [month_dic, temp_dic, area_dic]; biased_sample() currently reads only the first entry

In [50]:
#temp_dic = {'15-20': .5, '20-25': .25, '0-15': .2, '>25': .05}

#for i in temp_dic: 
    #print(i, temp_dic[i]) 

In [51]:
def sk_test_train(df, target, features, test_ratio):
    """
    Turn a DataFrame into feature/label arrays and split them into train and test parts.

    df - DataFrame holding both features and target
    target - positional column index (or list of indices) of the label
    features - positional column indices of the features
    test_ratio - passed to train_test_split as test_size

    return X_train, X_test, y_train, y_test
    """
    feature_matrix = df.iloc[:, features].to_numpy()

    # labels are flattened to one dimension
    labels = df.iloc[:, target].to_numpy().flatten()

    # fixed random_state keeps the split repeatable
    parts = train_test_split(feature_matrix, labels, test_size=test_ratio, random_state=0)

    return tuple(parts)

In [52]:
def sample_dic(cat_dic, size):
    """
    Scale a dictionary of category shares to row counts.

    cat_dic - maps each category to its share of the sample
    size - total sample size

    return a dictionary mapping each category to size * share
           (not rounded or converted to int)
    """
    return {category: size * share for category, share in cat_dic.items()}
    
    
    

In [150]:
def gen_sample(df, num, rand):
    """
    Draw num rows from df at random, with replacement.

    df - DataFrame to sample from
    num - number of rows to draw
    rand - seed passed to DataFrame.sample as random_state

    return the sampled rows
    """
    sample_options = {'n': num, 'replace': True, 'random_state': rand}

    return df.sample(**sample_options)

In [54]:
def biased_sample(df, size, bias_dics, column='temp_bins'):
    """
    Draw a sample whose category mix follows a bias dictionary.

    df - DataFrame to sample from
    size - total number of rows wanted
    bias_dics - list of bias dictionaries; only the first entry is used. It maps
                each category value to the share of size drawn from that category,
                e.g. {'15-20': .5, '20-25': .25, '0-15': .2, '>25': .05}
    column - name of the column holding the category values
             (defaults to 'temp_bins')

    Each category is sampled with replacement using the same fixed seed, and its
    row count is truncated to an integer, so the result can hold fewer than size
    rows when a share does not divide evenly.

    return the per-category samples stacked in dictionary order, or an empty
           DataFrame when the bias dictionary has no entries
    """
    counts = sample_dic(bias_dics[0], size)

    seed = 1

    # collect every piece first and concatenate once, instead of growing a
    # frame inside the loop
    pieces = [
        gen_sample(filter_df(df, [column], [category]), int(count), seed)
        for category, count in counts.items()
    ]

    if not pieces:

        return pd.DataFrame()

    return pd.concat(pieces)

In [55]:
# Share of the sample to draw from each temperature bin.
temp_dic = {'15-20': .5, '20-25': .25, '0-15': .2, '>25': .05}

# 300-row sample following the shares in temp_dic.
biased = biased_sample(raw, 300, [temp_dic])

# Uncomment to preview the sample.
#biased.head()

In [56]:
# Check that the sample has the requested number of rows.
len(biased)

300

In [57]:
# Save the biased sample to CSV without the index column.
biased.to_csv('test_set_v1.csv', index=False)  

In [58]:
#biased.hist(bins=15, figsize=(20,15))

In [59]:
# Shuffle the biased sample and hold out 40% of it as a test part.
train_b, test_b = train_test_split(biased, test_size=0.4, random_state=42, shuffle=True)

# Number of rows held out.
len(test_b)

120

In [180]:
def test_train_gen(df, test_info, train_info, dics, shuffle_seed=0):
    """
    Build a test set and a train set that mix biased rows with shared random rows.

    df - source DataFrame
    test_info - (size, bias_ratio) for the test set: size is the number of rows
                wanted, bias_ratio the fraction of them drawn by biased_sample()
    train_info - (size, bias_ratio) for the train set
    dics - [test_dics, train_dics]; each entry is the bias_dics list handed to
           biased_sample()
    shuffle_seed - seed for the final row shuffles, so repeated calls give the
                   same row order; pass None for an unseeded shuffle

    df is split into a "common" part (20%) and a remainder that is halved into a
    test-only and a train-only pool. Biased rows come from the separate pools,
    while the remaining rows of both sets are drawn from the same common part.

    return test, train (in that order)
    """
    # bias distribution dictionaries
    test_dics = dics[0]

    train_dics = dics[1]

    # size of returned data sets
    test_size = test_info[0]

    train_size = train_info[0]

    # number of biased samples
    test_bias = test_size * test_info[1]

    train_bias = train_size * train_info[1]

    # number of random samples taken from the common part
    test_com = int(test_size - test_bias)

    train_com = int(train_size - train_bias)

    # split the common data away
    remainder, common = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

    # split again so the biased test rows come from a pool the train set never draws from
    pre_train, pre_test = train_test_split(remainder, test_size=0.5, random_state=32, shuffle=True)

    # biased samples from the separate pools
    te_bi_sample = biased_sample(pre_test, test_bias, test_dics)

    tr_bi_sample = biased_sample(pre_train, train_bias, train_dics)

    # random samples from the common part
    te_com_sample = gen_sample(common, test_com, 21)

    tr_com_sample = gen_sample(common, train_com, 22)

    # concatenate and shuffle test data
    test_cat = pd.concat([te_bi_sample, te_com_sample])

    test = test_cat.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

    # concatenate and shuffle train data
    train_cat = pd.concat([tr_bi_sample, tr_com_sample])

    train = train_cat.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

    return test, train

In [181]:
# Row count of the raw data.
len(raw)

2652

In [182]:
def get_mse_naive(y, y_hat):
    """
    Calculate the MSE with numpy functions
    Do not use any sklearn functions

    y - Labels for the data
    y_hat - Predicted label for the data

    Both arguments may be any array-like (lists included). Length-1 axes are
    dropped before subtracting, so a column vector of shape (n, 1) and a flat
    vector of shape (n,) are compared element by element instead of being
    broadcast into an (n, n) grid.

    return MSE
    """
    actual = np.squeeze(np.asarray(y))

    predicted = np.squeeze(np.asarray(y_hat))

    return np.mean((actual - predicted) ** 2)

In [183]:
def test_train_linear_regression(X_train, y_train, X_test, y_test):
    """
    Fit a linear regression on the training data and score it on the test data.

    X_train - Training data
    y_train - Training labels
    X_test - Test data
    y_test - Test labels

    Return the MSE of the model's test predictions, as computed by get_mse_naive()
    """
    model = LinearRegression()

    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    return get_mse_naive(y_test, predictions)

In [184]:
def df_to_array(df, features, target):
    """
    Pull feature and label arrays out of a DataFrame by column position.

    df - DataFrame holding both features and target
    features - positional column indices of the features
    target - positional column index (or list of indices) of the label

    return X, y where X is the feature array and y the flattened label array
    """
    feature_matrix = df.iloc[:, features].to_numpy()

    # labels are flattened to one dimension
    labels = df.iloc[:, target].to_numpy().flatten()

    return feature_matrix, labels

In [185]:
# Preview the raw data again to look up column positions for the cells below.
raw.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,area_cat,temp_bins
0,1,2,8,5,91.0,166.9,752.6,7.1,25.9,41,3.6,0.0,0.0,0-5,>25
1,5,4,8,2,95.1,141.3,605.8,17.7,26.4,34,3.6,0.0,16.4,10-50,>25
2,6,5,8,3,92.1,111.2,654.1,9.6,16.6,47,0.9,0.0,2.29,0-5,15-20
3,6,3,8,4,91.6,138.1,621.7,6.3,18.9,41,3.1,0.0,10.34,10-50,15-20
4,4,5,8,0,90.2,99.6,631.2,6.3,21.4,33,3.1,0.0,0.0,0-5,20-25


In [186]:
# Positional column indices handed to df_to_array() (used with .iloc).
# NOTE(review): the generated frames report 15 columns, so position 0 looks like
# the 'X' column and is not among the features — confirm this is intended.
feats = [1,2,3,4,5,6,7,8,9,10,11]

# Position of the target column — presumably 'area'; verify against the column order.
targ = [12]

In [215]:
# Temperature-bin shares for the biased part of the test set.
test_temp = {'15-20': .5, '20-25': .25, '0-15': .2, '>25': .05}

# Temperature-bin shares for the biased part of the train set.
train_temp = {'15-20': .1, '20-25': .5, '0-15': .35, '>25': .05}

# Order matters: test_train_gen() reads index 0 as the test dictionaries and
# index 1 as the train dictionaries.
temps = [[test_temp], [train_temp]]

# (set size, fraction of the set drawn as biased rows)
te_info = (200, .2)

tr_info = (1000, .5)





# Note the return order: test first, then train.
test_v1, train_v1 = test_train_gen(raw, te_info, tr_info, temps)


In [216]:
# Check that both sets have the requested number of rows.
test_v1.shape, train_v1.shape

((200, 15), (1000, 15))

In [217]:
# Convert both sets to feature/label arrays using the column positions in feats and targ.
X_test, y_test = df_to_array(test_v1, feats, targ)

X_train, y_train = df_to_array(train_v1, feats, targ)

In [218]:
# Fit on the train set and compute the MSE on the test set.
error = test_train_linear_regression(X_train, y_train, X_test, y_test)

# Display the test MSE.
error

143.04776690645244

In [142]:
# Save the generated sets to CSV without the index column.
# NOTE(review): this cell's execution count (142) is lower than that of the cell
# that built test_v1/train_v1 (215), so the saved files may come from an earlier
# run — re-run this cell after regenerating the sets.
# NOTE(review): test_v1 is written to 'test_set_v2.csv'; confirm the version
# suffix is intended.
test_v1.to_csv('test_set_v2.csv', index=False)
train_v1.to_csv('train_set_v1.csv', index=False)