<a href="https://colab.research.google.com/github/duduteddy/python/blob/master/pipeline_online.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Add all relevant imports here
import os
import json
import numpy as np
import pandas as pd
import time

import xgboost as xgb
import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.ensemble as ensemble
import sklearn.neighbors as neighbors
import sklearn.naive_bayes as naive_bayes
import sklearn.linear_model as linear_model

from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn import preprocessing as preproc
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, mean_absolute_error, roc_curve, auc
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression



from google.colab import files
from google.colab import drive
# # uploaded = files.upload()


In [0]:
def splitMetaData(row_id):
    """Fill the global ``metadata`` dict from one row of the metadata sheet.

    The row labelled ``row_id`` is looked up in the global ``rows`` frame.
    Pipeline settings are copied into ``metadata``, the column description is
    sorted into numeric / categorical / unwanted lists, the target column is
    removed from those lists, and a summary is printed.

    Args:
        row_id: Index label of the sheet row, as used by ``rows.loc``.

    Returns:
        None. The result is written into the global ``metadata`` dict.
    """
    # Spreadsheet column letter for every field read from the row.
    sheet_letters = {
        'name': 'C',
        'columns': 'W',
        "preprocessing": "AG",
        "Libraries": "AS",
        'estimator_func_call': 'AU',
        'target_name': 'AC',
        'output_type': 'AA',
        'performance_metric': 'BB',
        'feature_selector': 'AL',
        'featureExtractor function call': 'AJ',
    }
    # alpha_to_number is defined outside this block; presumably it maps a
    # column letter to the key used to index a row -- TODO confirm.
    column_key = {
        field: alpha_to_number(letter)
        for field, letter in sheet_letters.items()
    }
    row = rows.loc[row_id]

    # (metadata key, sheet field) pairs that are copied straight across.
    direct_fields = (
        ('competition_name', 'name'),
        ('Libraries', 'Libraries'),
        ('preprocessing', 'preprocessing'),
        ('estimator', 'estimator_func_call'),
        ('target_column', 'target_name'),
        ('output_type', 'output_type'),
        ('metric', 'performance_metric'),
        ('feature_selector', 'feature_selector'),
        ('featureExtractor', 'featureExtractor function call'),
    )
    for meta_key, field in direct_fields:
        cell = row[column_key[field]]
        if meta_key == 'output_type':
            # The output type cell is a comma-separated list.
            cell = cell.split(',')
        metadata[meta_key] = cell
    columns = row[column_key['columns']]

    # The column description is a ';'-separated run of triples wrapped in one
    # leading and one trailing character; each triple ends with
    # (..., column name, column type).
    numeric_columns = []
    unwanted_columns = []
    categorical_columns = []
    bucket_for_type = {
        "numeric": numeric_columns,
        "integer": numeric_columns,
        "real": numeric_columns,
        "categorical": categorical_columns,
        "unwanted": unwanted_columns,
        'dateTime': unwanted_columns,
    }
    columns_data = [piece.strip() for piece in columns[1:-1].split(';')]
    print('columns_data: ', columns_data)
    # Every third entry (positions 2, 5, 8, ...) holds a type; the column
    # name sits right before it. Unknown types are ignored.
    for type_pos in range(2, len(columns_data), 3):
        bucket = bucket_for_type.get(columns_data[type_pos])
        if bucket is not None:
            bucket.append(columns_data[type_pos - 1])

    metadata['numeric_columns'] = numeric_columns
    metadata['unwanted_columns'] = unwanted_columns
    metadata['categorical_columns'] = categorical_columns

    # The target must not be listed among the feature columns.
    target = metadata['target_column']
    for list_key in ('numeric_columns', 'categorical_columns', 'unwanted_columns'):
        if target in metadata[list_key]:
            metadata[list_key].remove(target)

    print('competition name :' + metadata['competition_name'])
    print('numeric_columns:', metadata['numeric_columns'])
    print('categorical_columns:', metadata['categorical_columns'])
    print('unwanted_columns: ', metadata['unwanted_columns'])
    print('target_column:', metadata['target_column'])
    print('metric: ', metadata['metric'])
    print('feature_selector: ', metadata['feature_selector'])
    print('featureExtractor', metadata['featureExtractor'])
    print('estimator: ', metadata['estimator'])
    print("preprocessing: ", metadata['preprocessing'])

In [0]:
def preprocessing(train_df,test_df=None):
    """Run the preprocessing steps named in ``metadata['preprocessing']``.

    Train and test frames are stacked so that encoders and scalers see the
    same value range, the requested steps are applied to the stacked frame,
    and the result is split back into features and target.

    Args:
        train_df: Training data, including the target column.
        test_df: Optional test data with the same columns. When omitted and
            no 'train test split' step is requested, the returned test parts
            are empty.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    steps = metadata['preprocessing']
    target = metadata['target_column']

    train_num = train_df.shape[0]
    # pd.concat silently skips None entries, so a missing test_df is fine.
    data = pd.concat([train_df, test_df])
    if metadata['unwanted_columns']:
        data.drop(metadata['unwanted_columns'], axis=1, inplace=True)

    if 'label encoding' in steps:
        for c in metadata['categorical_columns']:
            data[c] = LabelEncoder().fit_transform(list(data[c]))

    if 'label categorical' in steps:
        # Binary object columns become a boolean "<name>_numeric" column;
        # every object column is then dropped.
        obj = list(data.select_dtypes(include=['object']).columns)
        for c in obj:
            uniques = data[c].unique()
            if len(uniques) == 2:
                data[c + '_numeric'] = (data[c].values == uniques[0])
        data.drop(obj, axis=1, inplace=True)

    if 'label target' in steps:
        lbl_enc = LabelEncoder()
        data[target] = lbl_enc.fit_transform(data[target].astype(str))

    if 'normalization/scaling' in steps:
        from sklearn.preprocessing import MinMaxScaler
        min_max_scaler = MinMaxScaler()
        # Scales every remaining column, the target included.
        data = pd.DataFrame(data=min_max_scaler.fit_transform(data),
                            columns=data.columns, index=data.index)

    if "nan processing" in steps:
        data = data.fillna(0)

    if 'train test split' in steps:
        # axis is passed by keyword: pandas 2.x no longer accepts it
        # positionally.
        X = data.drop(target, axis=1)
        # NOTE(review): 'log label' is only honoured in this branch.
        if 'log label' in steps:
            y = data[target].apply(np.log)
        else:
            y = data[target]

        X_train, X_test, y_train, y_test = train_test_split(X, y)
    else:
        # Undo the stacking: the first train_num rows are the training data.
        train = data[:train_num]
        test = data[train_num:]
        # Fit data must come from the train slice only; using the stacked
        # frame here would leak the test rows into training.
        X_train = train.drop(target, axis=1)
        y_train = train[target]
        X_test = test.drop(target, axis=1)
        y_test = test[target]

    if 'as type' in steps:
        X_test = X_test.astype(float)
        X_train = X_train.astype(float)

    return X_train, X_test, y_train, y_test


In [0]:
# Feature Extraction
def feature_extraction(X_train, X_test, y_train, y_test):
    """Transform the feature sets with the extractor named in the metadata.

    The extractor is built by evaluating ``metadata['featureExtractor']``,
    fitted on the training features and then applied to the test features.
    For a CountVectorizer the feature values are flattened to one dimension
    first. The targets are passed through untouched.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` with transformed features.
    """
    # NOTE(review): eval() executes text taken from the metadata sheet; only
    # run this against a sheet whose contents are trusted.
    extractor = eval(metadata['featureExtractor'])
    flatten_input = 'countvectorizer' in metadata['featureExtractor'].lower()

    train_values = X_train.values.flatten() if flatten_input else X_train.values
    X_train = extractor.fit_transform(train_values)

    test_values = X_test.values.flatten() if flatten_input else X_test.values
    X_test = extractor.transform(test_values)

    return X_train, X_test, y_train, y_test


In [0]:
def estimation(X_train, X_test, y_train, y_test):
    """Build the estimator from the metadata, predict and score the test set.

    The estimator is created by evaluating ``metadata['estimator']``. When
    ``metadata['Libraries']`` is "xgb" the evaluated object is used for
    prediction directly on an ``xgb.DMatrix``; otherwise it is fitted on the
    training data first. The score is chosen by ``metadata['metric']``
    ("rmse", "r^2" or "logloss").

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Test features and target.

    Returns:
        The computed score for the configured metric.

    Raises:
        ValueError: If ``metadata['metric']`` is not a supported metric.
    """
    # NOTE(review): eval() executes text taken from the metadata sheet; only
    # run this against a sheet whose contents are trusted. It must stay in
    # this scope because the expression may refer to the arguments above.
    model = eval(metadata['estimator'])
    if metadata['Libraries']=="xgb":
        predict = model.predict(xgb.DMatrix(np.array(X_test)))
    else:
        print(type(model))
        model.fit(X_train, y_train)
        predict = model.predict(X_test)
        print("predict:", predict)

    if metadata['metric'] == "rmse":
        loss = np.sqrt(mean_squared_error(y_test.values, predict))
    elif metadata['metric'] == "r^2":
        loss = r2_score(y_test, predict)
    elif metadata['metric'] == "logloss":
        predict_proba=model.predict_proba(X_test)
        loss=multiclass_logloss(y_test, predict_proba)
    else:
        # Without this branch `loss` would be unbound and the print below
        # would fail with an unrelated-looking UnboundLocalError.
        raise ValueError(
            "Unsupported performance metric: {!r}".format(metadata['metric']))

    print(metadata['metric'], " :", loss)
    return loss

In [96]:
# Attach Google Drive under /content/gdrive (the data directory used later lives there).
drive.mount(mountpoint='/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Load the 'Metadata' worksheet of the 'AutoKaggle' Google Sheet into the
# global DataFrame `rows`, which splitMetaData() reads.
# The first line is an IPython shell magic, so this cell only runs in a notebook.
!pip install --upgrade -q gspread
from google.colab import auth
# Authenticate the Colab user before any credentials are requested below.
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials

# Authorize gspread with the application-default credentials.
gc = gspread.authorize(GoogleCredentials.get_application_default())

worksheet = gc.open('AutoKaggle').worksheet('Metadata')
# All cell values of the worksheet; the first entry is used as the header below.
_rows = worksheet.get_all_values()

# Convert to a DataFrame and promote the first sheet row to the header.
import pandas as pd
rows = pd.DataFrame.from_records(_rows)

new_header = rows.iloc[0] #grab the first row for the header
# The slice keeps the original index labels, so the first data row is labelled 1.
rows = rows[1:] #take the data less the header row
rows.columns = new_header #set the header row as the df header

In [98]:
# Run the whole pipeline once for every selected row of the metadata sheet.
# row_ids are index labels of `rows` (looked up via rows.loc in splitMetaData).
row_ids = [303, 437]
# Shared state: splitMetaData() fills it, the other pipeline steps read it.
metadata = {}


cwd = "/content/gdrive/My Drive/Introduction to Data Science Spring 2019 Term Project/jh5976_yd1196/"
for row_id in row_ids:

    # Parsing MetaData updates the metadata dict
    metadata.clear()
    start = time.time()
    splitMetaData(row_id)
    competition_dir = cwd + metadata['competition_name'] + '/data/'

    train_df = pd.read_csv(competition_dir + 'trainData.csv')
    # The test file is optional. pd.read_csv never returns None, so its
    # absence has to be detected on disk rather than on the returned value.
    test_path = competition_dir + 'testData.csv'
    test_df = pd.read_csv(test_path) if os.path.exists(test_path) else None
    # preprocessing() treats test_df=None the same as an omitted argument.
    X_train, X_test, y_train, y_test = preprocessing(train_df, test_df)

    print(X_train.head(2))
    # Extraction and selection are skipped when their metadata cell is empty.
    if metadata['featureExtractor']:
        X_train, X_test, y_train, y_test = feature_extraction(X_train, X_test, y_train, y_test)

    if metadata['feature_selector']:
        X_train, X_test, y_train, y_test = feature_selection(X_train, X_test, y_train, y_test)
    print(X_test.shape)
    estimation(X_train, X_test, y_train, y_test)
    end = time.time()
    print('runningTimeSeconds is :'+str(end - start))


columns_data:  ['0', 'id', 'integer', '1', 'Open Date', 'dateTime', '2', 'City', 'unwanted', '3', 'City Group', 'categorical', '4', 'Type', 'categorical', '5', 'P1', 'integer', '6', 'P2', 'integer', '7', 'P3', 'integer', '0', 'P36', 'integer', '1', 'P37', 'integer', '2', 'revenue', 'integer']
competition name :restaurant-revenue-prediction
numeric_columns: ['id', 'P1', 'P2', 'P3', 'P36', 'P37']
categorical_columns: ['City Group', 'Type']
unwanted_columns:  ['Open Date', 'City']
target_column: revenue
metric:  rmse
feature_selector:  
featureExtractor 
estimator:  linear_model.LinearRegression()
preprocessing:  nan processing, normalization/scaling,label encoding
   City Group       Id        P1       P10       P11    P12       P13  \
0         0.0  0.00000  0.214286  0.166667  0.222222  0.375  0.444444   
1         0.0  0.00001  0.214286  0.166667  0.000000  0.375  0.444444   

        P14  P15       P16  ...       P35   P36  P37        P4        P5  \
0  0.066667  0.2  0.133333  ...  