In [1]:
from itertools import chain
import html
import ujson as json
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'talk' plotting context with a monospace font for all figures.
sns.set(context='talk', font='monospace')
# Load the working DataFrame from a pickle one directory above the notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle('../armslist_data.pkl')

# Single estimator instance that the regression cells below refit in turn.
lr = LinearRegression()

# Linear Regressions Against Price

## Setup

In [10]:
# Restrict the data to rows usable for the regressions below:
#   * price between 50 and 4000 USD (inclusive),
#   * manufacturer present and not a blank / '?' placeholder,
#   * caliber present.
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in 1.0.
# The manufacturer placeholder filter appeared twice in the original; once is enough.
df = df.loc[(df.price_usd >= 50) & (df.price_usd <= 4000), :]
df = df.loc[df.manufacturer.notnull(), :]
df = df.loc[~(df.manufacturer.isin([' ', '', '?'])), :]
df = df.loc[df.caliber.notnull(), :]

### Caliber, Caliber (Split) and Manufacturer

In [11]:
# Regress price on the dummy-encoded manufacturer, caliber, caliber_nums and
# caliber_chars columns.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): get_dummies encodes every level and the model also fits an
# intercept, so the design matrix is collinear; the intercept and individual
# coefficients are not meaningful on their own (consider drop_first=True).
df_X = pd.get_dummies(
    df.loc[:, ['manufacturer', 'caliber', 'caliber_nums', 'caliber_chars']]
)
lr.fit(df_X, df.price_usd)
print(lr.coef_.shape)  # one coefficient per encoded column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, df.price_usd)

(520,)
8.32063612929e+14


0.4548311462259626

### Caliber (Split) and Manufacturer

In [12]:
# Regress price on the dummy-encoded manufacturer, caliber_nums and caliber_chars.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): full dummy encoding plus an intercept is collinear, so the
# intercept/coefficients are not individually interpretable.
df_X = pd.get_dummies(df.loc[:, ['manufacturer', 'caliber_nums', 'caliber_chars']])
lr.fit(df_X, df.price_usd)
print(lr.coef_.shape)  # one coefficient per encoded column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, df.price_usd)

(350,)
-6.56458762971e+14


0.45474077770615062

### Caliber (Nums Only) and Manufacturer

In [13]:
# Regress price on the dummy-encoded manufacturer and caliber_nums columns
# (the full `caliber` column is not part of this feature set).
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): full dummy encoding plus an intercept is collinear, so the
# intercept/coefficients are not individually interpretable.
df_X = pd.get_dummies(df.loc[:, ['manufacturer', 'caliber_nums']])
lr.fit(df_X, df.price_usd)
print(lr.coef_.shape)  # one coefficient per encoded column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, df.price_usd)

(265,)
-5.98470572445e+14


0.43930641467545417

### Manufacturer

In [14]:
# Regress price on the dummy-encoded manufacturer column alone.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): the dummy columns sum to one in every row, which duplicates the
# intercept column; the intercept/coefficients are not individually interpretable.
df_X = pd.get_dummies(df.loc[:, ['manufacturer']])
lr.fit(df_X, df.price_usd)
print(lr.coef_.shape)  # one coefficient per encoded column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, df.price_usd)

(154,)
-5.41134861118e+13


0.30434226222537619

### Caliber (split)

In [15]:
# Regress price on the dummy-encoded caliber_nums and caliber_chars columns.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): full dummy encoding plus an intercept is collinear, so the
# intercept/coefficients are not individually interpretable.
df_X = pd.get_dummies(df.loc[:, ['caliber_nums', 'caliber_chars']])
lr.fit(df_X, df.price_usd)
print(lr.coef_.shape)  # one coefficient per encoded column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, df.price_usd)

(196,)
-2.29007375147e+14


0.26145072639702371

### Caliber, Caliber (Split), Manufacturer, Category_2, Category_3

In [16]:
# Regress price on the dummy-encoded category_2, category_3, manufacturer,
# caliber, caliber_nums and caliber_chars columns.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): full dummy encoding plus an intercept is collinear, so the
# intercept/coefficients are not individually interpretable.
df_X = pd.get_dummies(
    df.loc[:, ['category_2', 'category_3', 'manufacturer',
               'caliber', 'caliber_nums', 'caliber_chars']]
)
lr.fit(df_X, df.price_usd)
print(lr.coef_.shape)  # one coefficient per encoded column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, df.price_usd)

(546,)
-1.98849658219e+14


0.45943226210233801

### Caliber, Caliber (Split), Manufacturer, Category_3

In [17]:
# Regress price on the dummy-encoded category_3, manufacturer, caliber,
# caliber_nums and caliber_chars columns.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): full dummy encoding plus an intercept is collinear, so the
# intercept/coefficients are not individually interpretable.
df_X = pd.get_dummies(
    df.loc[:, ['category_3', 'manufacturer',
               'caliber', 'caliber_nums', 'caliber_chars']]
)
lr.fit(df_X, df.price_usd)
print(lr.coef_.shape)  # one coefficient per encoded column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, df.price_usd)

(529,)
-3.39427346414e+14


0.45870502083568876

### Caliber, Caliber (Split), Manufacturer, Category_2

In [18]:
# Regress price on the dummy-encoded category_2, manufacturer, caliber,
# caliber_nums and caliber_chars columns.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): full dummy encoding plus an intercept is collinear, so the
# intercept/coefficients are not individually interpretable.
df_X = pd.get_dummies(
    df.loc[:, ['category_2', 'manufacturer',
               'caliber', 'caliber_nums', 'caliber_chars']]
)
lr.fit(df_X, df.price_usd)
print(lr.coef_.shape)  # one coefficient per encoded column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, df.price_usd)

(537,)
1.99473332861e+13


0.45768923106322346

### Category_2, Category_3

In [19]:
# Regress price on the dummy-encoded category_2 and category_3 columns only.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): full dummy encoding plus an intercept is collinear, so the
# intercept/coefficients are not individually interpretable.
df_X = pd.get_dummies(df.loc[:, ['category_2', 'category_3']])
lr.fit(df_X, df.price_usd)
print(lr.coef_.shape)  # one coefficient per encoded column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, df.price_usd)

(26,)
-8.23790910345e+13


0.065160908160569631

# Linear Regressions Against Log of Price

## Category_2, Category_3, Manufacturer, Caliber

In [20]:
# Regress the natural log of price on the dummy-encoded category_2, category_3,
# manufacturer and caliber columns.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): full dummy encoding plus an intercept is collinear, so the
# intercept/coefficients are not individually interpretable.
df_X = pd.get_dummies(df.loc[:, ['category_2', 'category_3', 'manufacturer', 'caliber']])
# Vectorised log, computed once and reused for both fit and score
# (same values as the element-wise Series.apply(np.log)).
log_price = np.log(df.price_usd)
lr.fit(df_X, log_price)
print(lr.coef_.shape)  # one coefficient per encoded column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, log_price)

(350,)
244536482793.0


0.51674725081701034

## Manufacturer, Caliber

In [38]:
# Regress the natural log of price on the dummy-encoded manufacturer and
# caliber columns.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): full dummy encoding plus an intercept is collinear, so the
# intercept/coefficients are not individually interpretable.
df_X = pd.get_dummies(df.loc[:, ['manufacturer', 'caliber']])
# Vectorised log, computed once and reused for both fit and score
# (same values as the element-wise Series.apply(np.log)).
log_price = np.log(df.price_usd)
lr.fit(df_X, log_price)
print(lr.coef_.shape)  # one coefficient per encoded column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, log_price)

(324,)
-84192845863.4


0.53791073694284131

# Linear Regressions Against Risk Scores

We should be doing some train/test splits, but first things first.

In [47]:
# Load the cp2 pickle, add a numeric risk column and drop unusable rows.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
cp2_df = pd.read_pickle('../cp2_data.pkl')
risk_map = {'low': 0.0, 'medium': 0.5, 'high': 1.0}
# Plain dict lookup: a risk_profile value outside risk_map raises KeyError
# rather than silently becoming NaN.
cp2_df['risk_as_num'] = cp2_df.risk_profile.apply(lambda x: risk_map[x])
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in 1.0.
# The manufacturer placeholder filter appeared twice in the original; once is enough.
cp2_df = cp2_df.loc[cp2_df.manufacturer.notnull(), :]
cp2_df = cp2_df.loc[~(cp2_df.manufacturer.isin([' ', '', '?'])), :]
cp2_df = cp2_df.loc[cp2_df.caliber.notnull(), :]
# Finally drop every row that still has a missing value in any column.
cp2_df = cp2_df.dropna()

In [48]:
# Regress the numeric risk score on category_2, category_3, manufacturer,
# caliber (passed through get_dummies) and price_usd.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): full dummy encoding plus an intercept is collinear, so the
# intercept/coefficients are not individually interpretable.
df_X = pd.get_dummies(
    cp2_df.loc[:, ['category_2', 'category_3', 'manufacturer', 'caliber', 'price_usd']]
)
lr.fit(df_X, cp2_df.risk_as_num)
print(lr.coef_.shape)  # one coefficient per feature column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, cp2_df.risk_as_num)

(174,)
655125011.436


0.33249356306196431

In [49]:
# Keep only rows priced between 50 and 4000 USD (inclusive).
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in 1.0.
cp2_df = cp2_df.loc[(cp2_df.price_usd >= 50) & (cp2_df.price_usd <= 4000), :]

In [50]:
# Regress the numeric risk score on category_2, category_3, manufacturer,
# caliber (passed through get_dummies) and price_usd, using the current cp2_df.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): full dummy encoding plus an intercept is collinear, so the
# intercept/coefficients are not individually interpretable.
df_X = pd.get_dummies(
    cp2_df.loc[:, ['category_2', 'category_3', 'manufacturer', 'caliber', 'price_usd']]
)
lr.fit(df_X, cp2_df.risk_as_num)
print(lr.coef_.shape)  # one coefficient per feature column
print(lr.intercept_)
# R^2 on the same rows used for fitting (no hold-out split).
lr.score(df_X, cp2_df.risk_as_num)

(171,)
286785116.256


0.32489943329740817