In [1]:
print("HI")

HI


In [6]:
import pandas as pd
import numpy as np
import gc
import pickle
import xgboost as xgb
import matplotlib.pyplot as plt
from numba import jit

def OpenFile(file):
    with open(file, 'rb') as fi:
        data = pickle.load(fi)
    return data
    
@jit
def mcc(tp, tn, fp, fn):
    sup = tp * tn - fp * fn
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf==0:
        return 0
    else:
        return sup / np.sqrt(inf)

@jit
def eval_mcc(y_true, y_prob):
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true) # number of positive
    numn = n - nump # number of negative
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    prev_proba = -1
    mccs = np.zeros(n)
    for i in range(n):
        # all items with idx < i are predicted negative while others are predicted positive
        # only evaluate mcc when probability changes
        proba = y_prob[idx[i]]
        if proba != prev_proba:
            prev_proba = proba
            new_mcc = mcc(tp, tn, fp, fn)
            if new_mcc >= best_mcc:
                best_mcc = new_mcc
        mccs[i] = new_mcc
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
    return best_mcc
        
def mcc_eval(y_prob, dtrain):
    y_true = dtrain.get_label()
    best_mcc = eval_mcc(y_true, y_prob)
    return 'MCC', best_mcc
    
def ReadTestData(files, cols):
    directory = 'test/'
    testData = None
    for j,f in enumerate(files):
        print("file: " + f)
        subset = None
        for i, chunk in enumerate(pd.read_csv(directory + f, \
                                                chunksize = 50000, \
                                                usecols=cols, \
                                                low_memory = False)):
            print("chunk " + str(i))
            if subset is None:
                subset = chunk.copy()
            else:
                subset = pd.concat([subset, chunk])
            del chunk
            gc.collect()
        if testData is None:
            testData = subset.copy()
        else:
            testData = pd.merge(testData, subset.copy(), on="Id")
        del subset
        gc.collect()
    with open('testData_datesOnly.pk', 'wb') as fi:
        pickle.dump(testData, fi)    
        
def ReadTrainData_Resp1():
    Directory = 'train/'
    Files = ['train_numeric.csv', \
                  'train_categorical.csv', \
                  'train_date.csv']

    trainData = None
    maskArr = None
    for j,f in enumerate(Files):
        print("file: " + f)
        subset = None
        for i, chunk in enumerate(pd.read_csv(Directory + f, \
                                                chunksize = 50000, \
                                                low_memory = False)):
            print("chunk " + str(i))
            if j == 0:
                chunk = chunk[chunk['Response'] == 1]
            if subset is None:
                subset = chunk.copy()
            else:
                subset = pd.concat([subset, chunk])
            del chunk
            gc.collect()
            if j == 0:
                if subset.size > 0:
                    if maskArr is None:
                        maskArr = np.array(subset['Id'])
                    else:
                        maskArr = np.append(maskArr, subset['Id'])
            else:
                subset = subset[subset['Id'].isin(maskArr)]
        if trainData is None:
            trainData = subset.copy()
        else:
            trainData = pd.merge(trainData, subset.copy(), on="Id")
        del subset
        gc.collect()
    with open('Response_1s.pk', 'wb') as fi:
        pickle.dump(trainData, fi)    
        
def ReadTrainData_newFeatures():
    Directory = 'train/'
    Files = ['train_numeric.csv', \
                  'train_categorical.csv', \
                  'train_date.csv']

    trainData = None
    for j,f in enumerate(Files):
        print("file: " + f)
        subset = None
        for i, chunk in enumerate(pd.read_csv(Directory + f, \
                                                chunksize = 50000, \
                                                low_memory = False)):
            print("chunk " + str(i))
            sumCnt = chunk.notnull().sum(axis=1)
            sumCnt[:] = [x - 1 for x in sumCnt]
            if j == 0:
                sumCnt[:] = [x - 1 for x in sumCnt]
                chunkMod = pd.DataFrame({"Id": chunk.Id.values, \
                               "NumericCnt": sumCnt, \
                               "Response": chunk.Response.values}) 
            elif j == 1:
                chunkMod = pd.DataFrame({"Id": chunk.Id.values, \
                               "CategoricalCnt": sumCnt})
            elif j == 2:
                chunkMod = pd.DataFrame({"Id": chunk.Id.values, \
                               "DateCnt": sumCnt})
            if subset is None:
                subset = chunkMod.copy()
            else:
                subset = pd.concat([subset, chunkMod])
            del chunk
            gc.collect()
            
        if trainData is None:
            trainData = subset.copy()
        else:
            trainData = pd.merge(trainData, subset.copy(), on="Id")
        del subset
        gc.collect()
    with open('Train_ValCnts.pk', 'wb') as fi:
        pickle.dump(trainData, fi) 

def ReadTrainData_byDate(cols):
    trainData = None
    subset = None
    for i, chunk in enumerate(pd.read_csv('train/train_date.csv', \
                                                chunksize = 50000, \
                                                usecols=cols, \
                                                low_memory = False)):
        print("chunk " + str(i))
        dateCols = np.setdiff1d(chunk.columns, ['Id'])
        chunk['Start_Time'] = chunk[dateCols].min(axis=1).values
        if subset is None:
            subset = chunk.copy()
        else:
            subset = pd.concat([subset, chunk])
        del chunk
        gc.collect()
    if trainData is None:
        trainData = subset.copy()
    else:
        trainData = pd.merge(trainData, subset.copy(), on="Id")
    del subset
    gc.collect()
   
                                                    
    subset = None                                                
    for i, chunk in enumerate(pd.read_csv('train/train_numeric.csv', \
                                                chunksize = 50000, \
                                                usecols=['Response', 'Id'], \
                                                low_memory = False)):
        print("chunk " + str(i))
        chunk = chunk[['Response', 'Id']]
        if subset is None:
            subset = chunk.copy()
        else:
            subset = pd.concat([subset, chunk])
        del chunk
        gc.collect()
    if trainData is None:
        trainData = subset.copy()
    else:
        trainData = pd.merge(trainData, subset.copy(), on="Id")
    del subset
    gc.collect()
    with open('Full_byDate_trainData.pk', 'wb') as fi:
        pickle.dump(trainData, fi)
    return trainData

def ReadDateTrainData(cols):
    Directory = 'train/'
    Files = ['train_date.csv']
    trainData = None
    for j,f in enumerate(Files):
        print("file: " + f)
        subset = None
        for i, chunk in enumerate(pd.read_csv(Directory + f, \
                                                chunksize = 50000, \
                                                usecols=cols, \
                                                low_memory = False)):
            print("chunk " + str(i))
            if subset is None:
                subset = chunk.copy()
            else:
                subset = pd.concat([subset, chunk])
            del chunk
            gc.collect()
        if trainData is None:
            trainData = subset.copy()
        else:
            trainData = pd.merge(trainData, subset.copy(), on="Id")
        del subset
        gc.collect()
    with open('Date_Data.pk', 'wb') as fi:
        pickle.dump(trainData, fi)
    return trainData

def ReadCatTrainData():
    Directory = 'train/'
    Files = ['train_categorical.csv']
    trainData = None
    for j,f in enumerate(Files):
        print("file: " + f)
        subset = None
        for i, chunk in enumerate(pd.read_csv(Directory + f, \
                                                chunksize = 50000, \
                                                low_memory = False)):
            print("chunk " + str(i))
            if subset is None:
                subset = chunk.copy()
            else:
                subset = pd.concat([subset, chunk])
            del chunk
            gc.collect()
        if trainData is None:
            trainData = subset.copy()
        else:
            trainData = pd.merge(trainData, subset.copy(), on="Id")
        del subset
        gc.collect()
    with open('Numeric_Data.pk', 'wb') as fi:
        pickle.dump(trainData, fi)
    return trainData
    
def trainModel(trainData):
    trainData = trainData.sort_values(by=['Start_Time', 'Id'], ascending=True)
    ytrain = trainData.pop('Response')
    #trainData.pop('Id')
    prior = np.sum(ytrain) / (1.*len(ytrain))
    xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 2,
    'eval_metric': 'auc',
    'base_score': prior }   
    
    xgdmat = xgb.DMatrix(trainData, label=ytrain)
    cv_xgb = xgb.cv(params = xgb_params, dtrain = xgdmat, \
                    num_boost_round=10, \
                    nfold = 4, \
                    seed = 0, \
                    stratified = True, \
                    feval=mcc_eval, \
                    maximize=True,  \
                early_stopping_rounds = 1, \
                verbose_eval=1, show_stdv=True )
    return cv_xgb
#    
#    train_xgb = xgb.train(params = our_params, dtrain = xgdmat, \
#                    feval=mcc_eval, \
#                    maximize=True)
                
#    testFiles = ['test_date.csv']
#    testCols = dateCols
#    ReadTestData(testFiles, testCols)
                
#    testData = OpenFile('testData_datesOnly.pk')
#
#    testdmat = xgb.DMatrix(testData)
#    y_pred = train_xgb.predict(testdmat)
#    thresholdVal = .3
#    low_val = y_pred < thresholdVal
#    high_val = ~low_val
#    y_pred[low_val] = 0
#    y_pred[high_val] = 1
#    y_pred = y_pred.astype(int)
#    submission = pd.DataFrame({"Id": testData.Id.values, \
#                               "Response": y_pred})
#    submission[['Id', 'Response']].to_csv('xgbsubmission.csv', \
#                                            index=False) 

    
if __name__ == "__main__":
    print('Started')

    #ReadTrainData_newFeatures()
    trainData = OpenFile('/Pickle files/Train_ValCnts.pk')
#    xgb = trainModel(trainData)

#    CatCnt = np.array(trainData.CategoricalCnt)
#    NumericCnt = np.array(trainData.NumericCnt)
#    DateCnt = np.array(trainData.DateCnt)
#    Resp1 = np.array(trainData.Response == 1)
#    CatCnt2 = CatCnt[Resp1]
#    NumericCnt2 = NumericCnt[Resp1]
#    DateCnt2 = DateCnt[Resp1]
    
#    trainData = OpenFile('Response_1s.pk')    
#    dateData = trainData[trainData.columns[-1156:]].copy()
#    dateData = dateData.dropna(axis=1, how='all')
#    dateData = dateData.T.drop_duplicates().T 
#    dateCols = np.array(dateData.columns)
#    dateCols = np.append(['Id'], dateCols)
    #ReadTrainData_byDate(dateCols)
    #trainData = OpenFile('Full_byDate_trainData.pk') 
    #ReadNumericTrainData()
    #trainData = OpenFile('Numeric_Data.pk')
    #xgb = trainModel(trainData)
    #ReadDateTrainData(dateCols)
#    with open('Date_Data.pk', 'rb') as fi:
#        trainDateData = pickle.load(fi)
#    q = trainDateData.iloc[:,:].values
#    q=q[~np.isnan(q)]
#    z = plt.hist(q, bins=20)
#    p = dateData.iloc[:,:].values
#    p=p[~np.isnan(p)]
#    y = plt.hist(p, bins=20)
#    z=z[0]
#    y=y[0]
#    zz = [a_i - b_i for a_i, b_i in zip(z, y)]
#    xx = [a_i / b_i for a_i, b_i in zip(y, zz)]
#    plt.plot(xx)

    
    print('Finished')

Started


FileNotFoundError: [Errno 2] No such file or directory: '/Pickle files/Train_ValCnts.pk'

# HI #

In [7]:
dir

<function dir>

In [8]:
import os
cwd = os.getcwd()

In [9]:
cwd

'C:\\Users\\farha\\Documents\\Python\\Kaggle\\Bosch'

In [18]:
trainData = OpenFile('\\Pickle_files\\Train_ValCnts.pk')

FileNotFoundError: [Errno 2] No such file or directory: '\\Pickle_files\\Train_ValCnts.pk'

In [24]:
trainData = OpenFile('\\Train_ValCnts.pk')

FileNotFoundError: [Errno 2] No such file or directory: '\\Train_ValCnts.pk'

In [22]:
os.path.dirname(os.path.abspath('Train_ValCnts.pk'))

'C:\\Users\\farha\\Documents\\Python\\Kaggle\\Bosch'

In [27]:
trainData = OpenFile('C:\\Users\\farha\\Documents\\Python\\Kaggle\\Bosch\\Train_ValCnts.pk')

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\farha\\Documents\\Python\\Kaggle\\Bosch\\Train_ValCnts.pk'

In [29]:
os.path.curdir

'.'

In [31]:
for subdir, dirs, files in os.walk('./'):
    for file in files:
      print (file)

Bosch_PL.ipynb
main.py
Bosch_PL-checkpoint.ipynb
Date_Data.pk
Full_byDate_trainData.pk
Numeric_Data.pk
Response_1s.pk
testData_datesOnly.pk
Train_ValCnts.pk
sample_submission.csv
xgbsubmission.csv
test_categorical.csv
test_date.csv
test_numeric.csv
train_categorical.csv
train_categorical_less.csv
train_date.csv
train_date_less.csv
train_numeric.csv
train_numeric_less.csv


In [33]:
trainData = OpenFile("Train_ValCnts.pk")

FileNotFoundError: [Errno 2] No such file or directory: 'Train_ValCnts.pk'