### Plan
1. Read Data into Dataframe
2. Process String columns - lower-case, trim, and collapse repeated whitespace
3. Label Encode String columns in Train Data
4. Train on Training Data
5. Export Test Data from xlsx to csv
6. Read Test csvs into Dataframe
7. String Processing and Label Encoding for string Data
8. Predict and measure metrics for Test Data
9. Write test predictions to csvs (r1.csv,r2.csv,r3.csv,r4.csv)


In [2]:
import numpy as np # linear algebra
import pandas as pd # pandas for dataframe based data processing and CSV file I/O
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython.core.interactiveshell import InteractiveShell
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import matplotlib.dates as mdates
%matplotlib inline
import seaborn as sns
import math
import gc
import ipaddress
from urllib.parse import urlparse
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

from data_science_utils import dataframe as df_utils
from data_science_utils import models as model_utils
from data_science_utils import plots as plot_utils
from data_science_utils.dataframe import column as column_utils

from IPython.display import display, HTML


from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,classification_report


sns.set_style('whitegrid')
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import sys  # needed only for the print-threshold setting below

# Always print NumPy arrays in full (never summarize with "...").
# `threshold=np.nan` is rejected by current NumPy releases; `sys.maxsize`
# is the documented value for "no summarization".
np.set_printoptions(threshold=sys.maxsize)




# Default figure size for every plot in this notebook: wide and short
# (matplotlib interprets the tuple as width, height in inches).
plt.rcParams["figure.figsize"] = (24,4)

# Relax pandas' display limits so wide / long frames are shown with little
# or no truncation when they are rendered as cell output.
pd.set_option('display.max_seq_items', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import RobustScaler
import warnings
warnings.filterwarnings('ignore')


In [69]:
def string_column_processor(string):
    """Normalize one text cell for use as a categorical key.

    The value is lower-cased, leading/trailing whitespace is removed and every
    internal run of whitespace is collapsed to a single space.

    Parameters
    ----------
    string : object
        The cell value. Anything that is not a ``str`` (``None``, the float
        NaN pandas uses for missing cells, numbers, ...) is returned unchanged
        instead of raising ``AttributeError``.

    Returns
    -------
    object
        The normalized string, or the input itself when it is not a ``str``.
    """
    if not isinstance(string, str):
        return string
    # str.split() with no arguments already discards leading/trailing
    # whitespace, so a separate strip() is unnecessary.
    return ' '.join(string.lower().split())

In [70]:
# Load the sold-inventory export and keep only its first 3163 rows.
# NOTE(review): the reason for the 3163-row cut-off is not recorded in this
# notebook (trailing junk rows in the export?) - confirm and document it.
df = pd.read_csv("Inventorys_Sold_BBB.csv")
# .copy() makes the subset an independent frame, so the column assignments in
# the following cells modify `df` itself rather than a slice of the loaded
# file (which pandas reports as SettingWithCopyWarning).
df = df[:3163].copy()

In [71]:
# Replace the file's header with short snake_case names. Assigning eight names
# requires the loaded frame to have exactly eight columns.
df.columns = ['id','brand','model','year','type','sold_marketplace','date','price']
# Remove the 'id' column in place via the project helper.
# NOTE(review): presumably this ignores columns that are absent - verify
# against data_science_utils.
df_utils.drop_columns_safely(df,["id"],inplace=True)
# Both expressions are displayed because ast_node_interactivity is "all".
df.head()
df.shape

Unnamed: 0,brand,model,year,type,sold_marketplace,date,price
0,Redline,Conquest Pro,2015.0,Road,eBay,9/21/17,800.0
1,Schwinn,Impact (B897),2005.0,Mountain,eBay,9/27/17,65.0
2,Fuji,"SLM 2.0 BK MTB 21"" (B2197)",2009.0,Mountain,eBay,12/18/17,400.0
3,Raleigh,Detour iE,2015.0,Hybrid,eBay,9/21/17,700.0
4,Giant,Defy Advanced Pro 0,2015.0,Road,eBay,7/27/18,1750.0


(3163, 7)

In [62]:
# Print the number of distinct (non-null) values of every column.
for column, n_unique in df.nunique().items():
    print("# Unique values in %s = %s"%(column,n_unique))

# Null summary from the project helper; shown as the cell's output.
df_utils.count_nulls(df)

# Unique values in brand = 167
# Unique values in model = 2040
# Unique values in year = 25
# Unique values in type = 16
# Unique values in sold_marketplace = 2
# Unique values in date = 375
# Unique values in price = 954


Unnamed: 0_level_0,count
Column,Unnamed: 1_level_1
type,2
date,1
brand,0
model,0
year,0
sold_marketplace,0
price,0


In [72]:
# Parse the sale date. `infer_datetime_format` is intentionally not passed:
# it is deprecated since pandas 2.0, where consistent format inference is the
# default behaviour anyway.
df['date'] = pd.to_datetime(df['date'])

# Whole calendar years between the model year and the sale.
df['years_bought'] = df["date"].dt.year - df['year']

# Days between 1 January of the model year and the sale date. Subtracting the
# datetime64 columns directly keeps the computation vectorized, avoids
# round-tripping through Python `date` objects, and yields NaN for rows whose
# sale date is missing (NaT).
model_year_start = pd.to_datetime(df['year'].astype(int),format="%Y")
df['days_bought'] = (df["date"].dt.normalize() - model_year_start).dt.days


### Features

In [73]:
# For every categorical column: normalize the text, remember which categories
# occur in the training data, and replace the column with integer codes.
# Each encoder is also fitted on the extra label "unknown" so that unseen test
# values can be mapped to it later.
known_values = {}
encoders = {}
for column in ('brand', 'model', 'type', 'sold_marketplace'):
    cleaned = df[column].astype(str).apply(string_column_processor)
    categories = cleaned.unique()
    known_values[column] = set(categories)

    le = LabelEncoder()
    le.fit(list(categories) + ["unknown"])
    encoders[column] = le

    df[column] = le.transform(cleaned).astype(int)

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

In [74]:
# Model inputs (label-encoded categoricals plus the numeric model year).
features = [
    "brand",
    "model",
    "year",
    "sold_marketplace",
]
# Column the regressor is trained to predict.
target = "price"

In [147]:
# Gradient-boosted tree regressor trained on the full training frame
# (no hold-out split is made in this cell).
xgr = XGBRegressor(
    n_estimators=100,
    learning_rate=0.6,
    gamma=2,
    max_depth=12,
    n_jobs=48,
    missing=np.nan,
)
# The fitted model is the cell's displayed value.
xgr.fit(df[features], df[target])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=2, learning_rate=0.6, max_delta_step=0,
       max_depth=12, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=48, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

### Testing

In [116]:
# Column names applied to every test file (the files' own headers are replaced).
cols = ["brand","model","year","price","sold_marketplace"]

# Read the four test exports the same way: rename the columns, then coerce
# 'year' to numeric so unparseable entries become NaN.
test_frames = []
for path in ("s1.csv", "s2.csv", "s3.csv", "s4.csv"):
    frame = pd.read_csv(path)
    frame.columns = cols
    frame["year"] = pd.to_numeric(frame["year"], errors='coerce')
    test_frames.append(frame)

df_test1, df_test2, df_test3, df_test4 = test_frames


In [117]:
def label_encode_test(df):
    """Encode the categorical columns of a test frame with the training encoders.

    For 'brand', 'model' and 'sold_marketplace' the values are stringified and
    normalized with ``string_column_processor``; anything not present in
    ``known_values[column]`` is replaced by "unknown" before the matching
    encoder from ``encoders`` converts the column to integer codes.

    The frame is modified in place and also returned.
    """
    for column in ('brand', 'model', 'sold_marketplace'):
        cleaned = df[column].astype(str).apply(string_column_processor)
        # Keep values seen during training; everything else becomes "unknown".
        cleaned = cleaned.where(cleaned.isin(known_values[column]), "unknown")
        df[column] = encoders[column].transform(cleaned).astype(int)
    return df

In [118]:
# Encode each test frame in place; the trailing `;` keeps the returned frames
# from being rendered as cell output.
# NOTE: this cell is not idempotent. After one run the columns hold integer
# codes, so running it again stringifies those codes and maps values that are
# not training categories to "unknown". Reload the test CSVs before re-running.
label_encode_test(df_test1);
label_encode_test(df_test2);
label_encode_test(df_test3);
label_encode_test(df_test4);

In [122]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    The denominator |y_true| is clipped to the range [1, 1e8], so true values
    with magnitude below 1 (including zero) are treated as 1 instead of
    causing a division by zero.
    """
    denominator = np.clip(np.abs(y_true), 1, 1e8)
    relative_error = np.abs((y_true - y_pred) / denominator)
    return 100. * np.mean(relative_error)
def predict_and_check_error(df):
    """Predict prices for a test frame with ``xgr`` and score them.

    Predictions are rounded to whole numbers before scoring against
    ``df[target]``.

    Returns a dict with:
      - "preds":       the rounded predictions
      - "rmse":        ``model_utils.rmse`` of truth vs. predictions
      - "mape":        mean absolute percentage error (percent)
      - "ten_percent": number of rows whose absolute percentage error is < 10
      - "ten_in_ten":  that count as a percentage of all rows
    """
    y_pred = np.round(xgr.predict(df[features]))
    y_true = df[target]
    rmse = model_utils.rmse(y_true, y_pred)

    # Per-row absolute percentage error, with the same clipped denominator
    # that mean_absolute_percentage_error uses.
    pct_error = 100.* np.abs((y_true - y_pred) / np.clip(np.abs(y_true),1,1e8))
    n_within_ten = np.sum(pct_error<10)

    return {
        "preds": y_pred,
        "rmse": rmse,
        "mape": mean_absolute_percentage_error(y_true, y_pred),
        "ten_percent": n_within_ten,
        "ten_in_ten": 100*n_within_ten/len(y_true),
    }

In [148]:
# Score the four test frames in order; r1..r4 each hold one result dict.
r1, r2, r3, r4 = [
    predict_and_check_error(frame)
    for frame in (df_test1, df_test2, df_test3, df_test4)
]

In [149]:
# One row per test set, one column per metric ("ten" is the count of rows
# predicted within 10 %, "ten_in_ten" the same as a percentage).
results = [r1, r2, r3, r4]
pd.DataFrame({
    "rmse": [r["rmse"] for r in results],
    "mape": [r["mape"] for r in results],
    "ten": [r["ten_percent"] for r in results],
    "ten_in_ten": [r["ten_in_ten"] for r in results],
})

Unnamed: 0,mape,rmse,ten,ten_in_ten
0,25.295923,386.637927,2828,68.5742
1,96.077302,881.41635,122,20.538721
2,96.077302,881.41635,122,20.538721
3,96.077302,881.41635,122,20.538721


In [151]:
# Write each test set's rounded predictions to its own single-column CSV.
for path, result in zip(("r1.csv", "r2.csv", "r3.csv", "r4.csv"), (r1, r2, r3, r4)):
    pd.DataFrame({"preds": result["preds"]}).to_csv(path, index=False)

### EDA Concepts

- Price depends on how many years after the model year the item is sold
- Price depreciation per year per brand

- Label encode brand,model,type,sold_marketplace
- type average, brand average
