In [1]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import requests
import base64


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low
        
# This function submits an assignment.  You can submit an assignment as much as you like, only the final
# submission counts.  The paramaters are as follows:
# data - Pandas dataframe output.
# key - Your student key that was emailed to you.
# no - The assignment class number, should be 1 through 1.
# source_file - The full path to your Python or IPYNB file.  This must have "_class1" as part of its name.  
# .             The number must match your assignment number.  For example "_class2" for class assignment #2.
def submit(data,key,no,source_file=None):
    if source_file is None and '__file__' not in globals(): raise Exception('Must specify a filename when a Jupyter notebook.')
    if source_file is None: source_file = __file__
    suffix = '_class{}'.format(no)
    if suffix not in source_file: raise Exception('{} must be part of the filename.'.format(suffix))
    with open(source_file, "rb") as image_file:
        encoded_python = base64.b64encode(image_file.read()).decode('ascii')
    ext = os.path.splitext(source_file)[-1].lower()
    if ext not in ['.ipynb','.py']: raise Exception("Source file is {} must be .py or .ipynb".format(ext))
    r = requests.post("https://api.heatonresearch.com/assignment-submit",
        headers={'x-api-key':key}, json={'csv':base64.b64encode(data.to_csv(index=False).encode('ascii')).decode("ascii"),
        'assignment': no, 'ext':ext, 'py':encoded_python})
    if r.status_code == 200:
        print("Success: {}".format(r.text))
    else: print("Failure: {}".format(r.text))

In [5]:
from sklearn.linear_model import LassoCV
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore 
import sklearn
from sklearn.model_selection import train_test_split 
from sklearn.grid_search import GridSearchCV
from scipy.stats import boxcox
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

df = pd.read_csv('../input/train.csv')

df.drop('id', axis=1, inplace=True)

encode_text_dummy(df, 'manufacturer')

n1=df['name'].str.contains('Paperclips').astype(int)
n2=df['name'].str.contains('Pencils').astype(int)
n3=df['name'].str.contains('Pens').astype(int)
n4=df['name'].str.contains('Tablets').astype(int)
n5=df['name'].str.contains('Thumbtacks').astype(int)
n6=df['name'].str.contains('Paperweights').astype(int)
n7=df['name'].str.contains('Stapler').astype(int)
n8=df['name'].str.contains('Post').astype(int)

df.insert(1,'Paperclips',n1)
df.insert(1,'Pencils',n2)
df.insert(1,'Pens',n3)
df.insert(1,'Tablets',n4)
df.insert(1,'Thumbtacks',n5)
df.insert(1,'Paperweights',n6)
df.insert(1,'Stapler',n7)
df.insert(1,'Post',n8)

N1=df['name'].str.contains('High').astype(int)
N2=df['name'].str.contains('Medium').astype(int)
N3=df['name'].str.contains('Large').astype(int)
N4=df['name'].str.contains('Small').astype(int)
N5=df['name'].str.contains('Tiny').astype(int)

df.insert(1,'High',N1)
df.insert(1,'Medium',N2)
df.insert(1,'Large',N3)
df.insert(1,'Small',N4)
df.insert(1,'Tiny',N5)

C1=df['name'].str.contains('Red').astype(int)
C2=df['name'].str.contains('Pink').astype(int)
C3=df['name'].str.contains('Black').astype(int)
C4=df['name'].str.contains('Green').astype(int)
C5=df['name'].str.contains('White').astype(int)
C6=df['name'].str.contains('Blue').astype(int)
C7=df['name'].str.contains('Brown').astype(int)

df.insert(1,'Red',C1)
df.insert(1,'Pink',C2)
df.insert(1,'Black',C3)
df.insert(1,'Green',C4)
df.insert(1,'White',C5)
df.insert(1,'Blue',C6)
df.insert(1,'Brown',C7)

M1=df['name'].str.contains('Generic').astype(int)
df.insert(1,'Generic',M1)


df.drop('name', axis=1, inplace=True)


df['cost']=np.log(df['cost'])
df['pack']=np.log(df['pack'])
df['weight']=np.log(df['weight'])

df['height']=np.log(df['height'])
df['width']=np.log(df['width'])
df['length']=np.log(df['length'])



x,y = to_xy(df,"cost")


model_gbr = GradientBoostingRegressor(n_estimators=4000, learning_rate=0.05,
                                   max_depth=5, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
model_gbr.fit(x,y)

model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=1400,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
model_xgb.fit(x,y)
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
model_lgb.fit(x,y)





  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=800, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [6]:
tdf = pd.read_csv('../input/test.csv')

ids = tdf['id']
tdf.drop('id', axis=1, inplace=True)


encode_text_dummy(tdf, 'manufacturer')

n1=tdf['name'].str.contains('Paperclips').astype(int)
n2=tdf['name'].str.contains('Pencils').astype(int)
n3=tdf['name'].str.contains('Pens').astype(int)
n4=tdf['name'].str.contains('Tablets').astype(int)
n5=tdf['name'].str.contains('Thumbtacks').astype(int)
n6=tdf['name'].str.contains('Paperweights').astype(int)
n7=tdf['name'].str.contains('Stapler').astype(int)
n8=tdf['name'].str.contains('Post').astype(int)

tdf.insert(1,'Paperclips',n1)
tdf.insert(1,'Pencils',n2)
tdf.insert(1,'Pens',n3)
tdf.insert(1,'Tablets',n4)
tdf.insert(1,'Thumbtacks',n5)
tdf.insert(1,'Paperweights',n6)
tdf.insert(1,'Stapler',n7)
tdf.insert(1,'Post',n8)

N1=tdf['name'].str.contains('High').astype(int)
N2=tdf['name'].str.contains('Medium').astype(int)
N3=tdf['name'].str.contains('Large').astype(int)
N4=tdf['name'].str.contains('Small').astype(int)
N5=tdf['name'].str.contains('Tiny').astype(int)

tdf.insert(1,'High',N1)
tdf.insert(1,'Medium',N2)
tdf.insert(1,'Large',N3)
tdf.insert(1,'Small',N4)
tdf.insert(1,'Tiny',N5)

C1=tdf['name'].str.contains('Red').astype(int)
C2=tdf['name'].str.contains('Pink').astype(int)
C3=tdf['name'].str.contains('Black').astype(int)
C4=tdf['name'].str.contains('Green').astype(int)
C5=tdf['name'].str.contains('White').astype(int)
C6=tdf['name'].str.contains('Blue').astype(int)
C7=tdf['name'].str.contains('Brown').astype(int)

tdf.insert(1,'Red',C1)
tdf.insert(1,'Pink',C2)
tdf.insert(1,'Black',C3)
tdf.insert(1,'Green',C4)
tdf.insert(1,'White',C5)
tdf.insert(1,'Blue',C6)
tdf.insert(1,'Brown',C7)

M1=tdf['name'].str.contains('Generic').astype(int)
tdf.insert(1,'Generic',M1)

tdf.drop('name', axis=1, inplace=True)

tdf['pack']=np.log(tdf['pack'])
tdf['weight']=np.log(tdf['weight'])

tdf['height']=np.log(tdf['height'])
tdf['width']=np.log(tdf['width'])
tdf['length']=np.log(tdf['length'])

x_test = tdf.as_matrix().astype(np.float32)


tmodel_gbr = model_gbr.predict(x_test)
pred1=np.exp(tmodel_gbr)

tmodel_xgb=model_xgb.predict(x_test)
pred2=np.exp(tmodel_xgb)

tmodel_lgb=model_lgb.predict(x_test)
pred3=np.exp(tmodel_lgb)

pred=pred1*0.75+pred2*0.15+pred3*0.1

df_submit = pd.DataFrame(pred)
df_submit.insert(0,'id',ids)
df_submit.columns = ['id','cost']

df_submit.to_csv('submission.csv', index=False)