In [1]:
# import all the libraries

# Data exploration and analysis tools
import pandas as pd
import seaborn as sns
import numpy as np
from ast import literal_eval
import re as re

# Data cleaning for machine learning models
from sklearn.model_selection import train_test_split # split data into training and testing sets
from sklearn.feature_selection import SelectKBest # keep the k features of X that score highest against y
from sklearn.feature_selection import mutual_info_regression # scoring function to pass to SelectKBest
from sklearn.preprocessing import StandardScaler # standardize features to zero mean and unit variance (this rescales; it does not remove outliers)

# Machine Learning model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
#reg = LinearRegression(fit_intercept=True)
#fit_intercept = True; hyper parameter for linear regression, add one-extra term - a start value (a starting weight); rarely False



# Error Measures
from sklearn.dummy import DummyRegressor
# Use DummyRegressor to compare your linear regression to the dumbest possible

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error



In [2]:
# Load the raw Bike Index export from a CSV file located in the working directory.
bike_data = pd.read_csv('bike_index_api_stolenessall.csv')

In [3]:
# Wrap the loaded data in a DataFrame under the name used by the rest of the notebook.
bike_df = pd.DataFrame(bike_data)

In [4]:
# Summarise column names, non-null counts and dtypes of the raw frame.
bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188574 entries, 0 to 188573
Data columns (total 14 columns):
Unnamed: 0           188574 non-null int64
date_stolen          62097 non-null float64
frame_colors         188574 non-null object
frame_model          170000 non-null object
id                   188574 non-null int64
is_stock_img         188574 non-null bool
large_img            78875 non-null object
manufacturer_name    188567 non-null object
serial               188211 non-null object
stolen               188574 non-null bool
stolen_location      61129 non-null object
thumb                78875 non-null object
title                188573 non-null object
year                 133836 non-null float64
dtypes: bool(2), float64(2), int64(2), object(8)
memory usage: 17.6+ MB


This data set contains every datapoint ever collected through the BikeIndex API. What I really want are the datapoints for bikes in San Francisco. For the purposes of this project, I am first going to narrow the data set to bikes whose stolen location has some representation of California. Even among the records I know are in California, some fields will contain NaNs; I expect to run into these as I dig into the San Francisco data specifically.

In [5]:
# Keep only rows that have a stolen_location and whose location text mentions California.
# dropna() already returns a new frame, so no up-front copy of bike_df is needed.
bike_df_clean = bike_df.dropna(subset=['stolen_location'])
# NOTE(review): the bare 'CA' alternative is a case-sensitive substring match, so any
# location text containing "CA" for another reason is kept too — confirm this is acceptable.
is_california = bike_df_clean['stolen_location'].apply(str).str.contains('CA|California|San Francisco')
# .copy() makes the filtered result an independent frame, so the column assignments
# made in later cells do not raise pandas' SettingWithCopyWarning.
bike_df_clean = bike_df_clean[is_california].copy()

In [6]:
def split_zipcode(x):
    """Return the first run of five consecutive digits found in ``x``.

    Parameters
    ----------
    x : any
        Value to search; it is converted with ``str()`` first, so non-string
        inputs (numbers, ``None``, NaN) are accepted.

    Returns
    -------
    str or None
        The first five-digit substring, or ``None`` when there is none.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence,
    # which newer Python versions warn about.
    match = re.search(r'\d{5}', str(x))
    if match is None:
        return None
    return match.group(0)
            
# Derive a ZIP code column by running split_zipcode over every stolen_location value.
stolen_locations = bike_df_clean['stolen_location']
bike_df_clean['stolen_zipcode'] = stolen_locations.apply(split_zipcode)

In [7]:
# Preview the first rows only; rendering the whole frame (thousands of rows) bloats the notebook.
bike_df_clean.head(10)

Unnamed: 0.1,Unnamed: 0,date_stolen,frame_colors,frame_model,id,is_stock_img,large_img,manufacturer_name,serial,stolen,stolen_location,thumb,title,year,stolen_zipcode
33,33,1.541264e+09,['Blue'],Cross-Check,462239,False,https://files.bikeindex.org/uploads/Pu/140320/...,Surly,YS-PC20270,True,"San Francisco,CA,94105",https://files.bikeindex.org/uploads/Pu/140320/...,2014 Surly Cross-Check,2014.0,94105
39,39,1.541236e+09,"['Red', 'Silver, gray or bare metal']",OCR 3,461486,False,https://files.bikeindex.org/uploads/Pu/140180/...,Giant,absent,True,"San Francisco,CA,94114",https://files.bikeindex.org/uploads/Pu/140180/...,2007 Giant OCR 3,2007.0,94114
40,40,1.541236e+09,"['Silver, gray or bare metal']",Thin 7,461723,False,https://files.bikeindex.org/uploads/Pu/140253/...,Sondors,MT17004959,True,"Berkeley,CA,94704",https://files.bikeindex.org/uploads/Pu/140253/...,Sondors Thin 7,,94704
41,41,1.541221e+09,['Black'],N/a,461764,False,https://files.bikeindex.org/uploads/Pu/140264/...,Not visible on bike,absent,True,"San Francisco,CA,94110",https://files.bikeindex.org/uploads/Pu/140264/...,Not visible on bike N/a,,94110
42,42,1.541259e+09,['White'],Lightweight 6061 Aluminum Frame,460962,False,,SXL,absent,True,"Los Angeles,CA,90007",,2018 SXL Lightweight 6061 Aluminum Frame,2018.0,90007
47,47,1.541200e+09,"['Silver, gray or bare metal']",Mountain,462623,False,,Genesis,no number,True,"Chico,CA,95973",,Genesis Mountain,,95973
48,48,1.541192e+09,"['Silver, gray or bare metal']",Cadent 1,460772,False,https://files.bikeindex.org/uploads/Pu/140095/...,Raleigh,u149k14722,True,"San Francisco,CA,94103",https://files.bikeindex.org/uploads/Pu/140095/...,2015 Raleigh Cadent 1,2015.0,94103
49,49,1.541192e+09,"['Silver, gray or bare metal']",Bike DB APEX,460776,False,https://files.bikeindex.org/uploads/Pu/140096/...,Diamondback,DAA16F000473,True,"San Diego,CA,92109",https://files.bikeindex.org/uploads/Pu/140096/...,2016 Diamondback Bike DB APEX,2016.0,92109
56,56,1.541185e+09,"['Blue', 'Blue']","19"" frame size kent bayside.",460699,False,,Kent,GS72696,True,"Santa Ana,CA,92705",,"Kent 19"" frame size kent bayside.",,92705
69,69,1.541131e+09,['Black'],Volare,69412,False,https://files.bikeindex.org/uploads/Pu/46648/l...,Schwinn,SNMNG 14C37721,True,"San Francisco,CA,94118",https://files.bikeindex.org/uploads/Pu/46648/s...,2014 Schwinn Volare,2014.0,94118


In [8]:
# Check row count, non-null counts and dtypes after the California filter and ZIP code extraction.
bike_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13313 entries, 33 to 99992
Data columns (total 15 columns):
Unnamed: 0           13313 non-null int64
date_stolen          13313 non-null float64
frame_colors         13313 non-null object
frame_model          12308 non-null object
id                   13313 non-null int64
is_stock_img         13313 non-null bool
large_img            8185 non-null object
manufacturer_name    13313 non-null object
serial               13282 non-null object
stolen               13313 non-null bool
stolen_location      13313 non-null object
thumb                8185 non-null object
title                13313 non-null object
year                 11010 non-null float64
stolen_zipcode       12881 non-null object
dtypes: bool(2), float64(2), int64(2), object(9)
memory usage: 1.4+ MB


In [9]:
# date_stolen is loaded as a float; interpret it as seconds since the Unix epoch
# and replace the column with pandas datetimes.
epoch_seconds = bike_df_clean['date_stolen']
bike_df_clean['date_stolen'] = pd.to_datetime(epoch_seconds, unit='s')

In [10]:
# Pull the calendar year out of each theft timestamp so records can be filtered by year.
stolen_timestamps = bike_df_clean['date_stolen']
bike_df_clean['year_stolen'] = stolen_timestamps.dt.year

In [11]:
# Preview the first rows to confirm the converted date_stolen and the new year_stolen column.
bike_df_clean.head(10)

Unnamed: 0.1,Unnamed: 0,date_stolen,frame_colors,frame_model,id,is_stock_img,large_img,manufacturer_name,serial,stolen,stolen_location,thumb,title,year,stolen_zipcode,year_stolen
33,33,2018-11-03 17:00:00,['Blue'],Cross-Check,462239,False,https://files.bikeindex.org/uploads/Pu/140320/...,Surly,YS-PC20270,True,"San Francisco,CA,94105",https://files.bikeindex.org/uploads/Pu/140320/...,2014 Surly Cross-Check,2014.0,94105,2018
39,39,2018-11-03 09:00:00,"['Red', 'Silver, gray or bare metal']",OCR 3,461486,False,https://files.bikeindex.org/uploads/Pu/140180/...,Giant,absent,True,"San Francisco,CA,94114",https://files.bikeindex.org/uploads/Pu/140180/...,2007 Giant OCR 3,2007.0,94114,2018
40,40,2018-11-03 09:00:00,"['Silver, gray or bare metal']",Thin 7,461723,False,https://files.bikeindex.org/uploads/Pu/140253/...,Sondors,MT17004959,True,"Berkeley,CA,94704",https://files.bikeindex.org/uploads/Pu/140253/...,Sondors Thin 7,,94704,2018
41,41,2018-11-03 05:00:00,['Black'],N/a,461764,False,https://files.bikeindex.org/uploads/Pu/140264/...,Not visible on bike,absent,True,"San Francisco,CA,94110",https://files.bikeindex.org/uploads/Pu/140264/...,Not visible on bike N/a,,94110,2018
42,42,2018-11-03 15:24:15,['White'],Lightweight 6061 Aluminum Frame,460962,False,,SXL,absent,True,"Los Angeles,CA,90007",,2018 SXL Lightweight 6061 Aluminum Frame,2018.0,90007,2018
47,47,2018-11-02 23:00:00,"['Silver, gray or bare metal']",Mountain,462623,False,,Genesis,no number,True,"Chico,CA,95973",,Genesis Mountain,,95973,2018
48,48,2018-11-02 21:00:44,"['Silver, gray or bare metal']",Cadent 1,460772,False,https://files.bikeindex.org/uploads/Pu/140095/...,Raleigh,u149k14722,True,"San Francisco,CA,94103",https://files.bikeindex.org/uploads/Pu/140095/...,2015 Raleigh Cadent 1,2015.0,94103,2018
49,49,2018-11-02 21:00:00,"['Silver, gray or bare metal']",Bike DB APEX,460776,False,https://files.bikeindex.org/uploads/Pu/140096/...,Diamondback,DAA16F000473,True,"San Diego,CA,92109",https://files.bikeindex.org/uploads/Pu/140096/...,2016 Diamondback Bike DB APEX,2016.0,92109,2018
56,56,2018-11-02 18:56:45,"['Blue', 'Blue']","19"" frame size kent bayside.",460699,False,,Kent,GS72696,True,"Santa Ana,CA,92705",,"Kent 19"" frame size kent bayside.",,92705,2018
69,69,2018-11-02 04:00:00,['Black'],Volare,69412,False,https://files.bikeindex.org/uploads/Pu/46648/l...,Schwinn,SNMNG 14C37721,True,"San Francisco,CA,94118",https://files.bikeindex.org/uploads/Pu/46648/s...,2014 Schwinn Volare,2014.0,94118,2018


I want to know which years are represented in this data set. My ZIP code data set recommends only using information from 2015 onwards, so I will keep only thefts from those years.

In [12]:
# Sanity-check the year values: list the distinct years to confirm there are no
# duplicated or oddly formatted entries.
bike_df_clean['year_stolen'].unique()

array([2018, 2009, 2008, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010,
       2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1998, 1990])

In [13]:
# Distinct theft years, in ascending order.
year_stolen = np.sort(bike_df_clean['year_stolen'].unique())
year_stolen

array([1990, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
       2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018])

In [14]:
# Count the non-null values in every column for each theft year.
bike_df_clean.groupby('year_stolen').count()

Unnamed: 0_level_0,Unnamed: 0,date_stolen,frame_colors,frame_model,id,is_stock_img,large_img,manufacturer_name,serial,stolen,stolen_location,thumb,title,year,stolen_zipcode
year_stolen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1990,1,1,1,1,1,1,0,1,1,1,1,0,1,1,0
1998,6,6,6,6,6,6,0,6,6,6,6,0,6,6,6
2000,20,20,20,17,20,20,3,20,20,20,20,3,20,18,20
2001,2,2,2,1,2,2,1,2,2,2,2,1,2,1,2
2002,6,6,6,6,6,6,1,6,6,6,6,1,6,5,6
2003,2,2,2,2,2,2,0,2,2,2,2,0,2,2,2
2004,10,10,10,10,10,10,0,10,10,10,10,0,10,8,10
2005,108,108,108,103,108,108,15,108,108,108,108,15,108,96,108
2006,149,149,149,145,149,149,14,149,149,149,149,14,149,136,149
2007,199,199,199,189,199,199,64,199,199,199,199,64,199,166,199


In [15]:
# Preview the first rows only. The cells since year_stolen was added only read
# bike_df_clean, so its contents are unchanged from the previous preview.
bike_df_clean.head(10)

Unnamed: 0.1,Unnamed: 0,date_stolen,frame_colors,frame_model,id,is_stock_img,large_img,manufacturer_name,serial,stolen,stolen_location,thumb,title,year,stolen_zipcode,year_stolen
33,33,2018-11-03 17:00:00,['Blue'],Cross-Check,462239,False,https://files.bikeindex.org/uploads/Pu/140320/...,Surly,YS-PC20270,True,"San Francisco,CA,94105",https://files.bikeindex.org/uploads/Pu/140320/...,2014 Surly Cross-Check,2014.0,94105,2018
39,39,2018-11-03 09:00:00,"['Red', 'Silver, gray or bare metal']",OCR 3,461486,False,https://files.bikeindex.org/uploads/Pu/140180/...,Giant,absent,True,"San Francisco,CA,94114",https://files.bikeindex.org/uploads/Pu/140180/...,2007 Giant OCR 3,2007.0,94114,2018
40,40,2018-11-03 09:00:00,"['Silver, gray or bare metal']",Thin 7,461723,False,https://files.bikeindex.org/uploads/Pu/140253/...,Sondors,MT17004959,True,"Berkeley,CA,94704",https://files.bikeindex.org/uploads/Pu/140253/...,Sondors Thin 7,,94704,2018
41,41,2018-11-03 05:00:00,['Black'],N/a,461764,False,https://files.bikeindex.org/uploads/Pu/140264/...,Not visible on bike,absent,True,"San Francisco,CA,94110",https://files.bikeindex.org/uploads/Pu/140264/...,Not visible on bike N/a,,94110,2018
42,42,2018-11-03 15:24:15,['White'],Lightweight 6061 Aluminum Frame,460962,False,,SXL,absent,True,"Los Angeles,CA,90007",,2018 SXL Lightweight 6061 Aluminum Frame,2018.0,90007,2018
47,47,2018-11-02 23:00:00,"['Silver, gray or bare metal']",Mountain,462623,False,,Genesis,no number,True,"Chico,CA,95973",,Genesis Mountain,,95973,2018
48,48,2018-11-02 21:00:44,"['Silver, gray or bare metal']",Cadent 1,460772,False,https://files.bikeindex.org/uploads/Pu/140095/...,Raleigh,u149k14722,True,"San Francisco,CA,94103",https://files.bikeindex.org/uploads/Pu/140095/...,2015 Raleigh Cadent 1,2015.0,94103,2018
49,49,2018-11-02 21:00:00,"['Silver, gray or bare metal']",Bike DB APEX,460776,False,https://files.bikeindex.org/uploads/Pu/140096/...,Diamondback,DAA16F000473,True,"San Diego,CA,92109",https://files.bikeindex.org/uploads/Pu/140096/...,2016 Diamondback Bike DB APEX,2016.0,92109,2018
56,56,2018-11-02 18:56:45,"['Blue', 'Blue']","19"" frame size kent bayside.",460699,False,,Kent,GS72696,True,"Santa Ana,CA,92705",,"Kent 19"" frame size kent bayside.",,92705,2018
69,69,2018-11-02 04:00:00,['Black'],Volare,69412,False,https://files.bikeindex.org/uploads/Pu/46648/l...,Schwinn,SNMNG 14C37721,True,"San Francisco,CA,94118",https://files.bikeindex.org/uploads/Pu/46648/s...,2014 Schwinn Volare,2014.0,94118,2018


In [16]:
# Keep only thefts from 2015 onwards, the range recommended for the ZIP code data set.
# year_stolen holds integers (it comes from .dt.year), so compare against integer
# literals; string literals such as '2018' only match if pandas happens to coerce them.
# .copy() gives an independent frame that is safe to modify later.
bike_df_final = bike_df_clean[bike_df_clean['year_stolen'].isin([2015, 2016, 2017, 2018])].copy()

In [17]:
# Confirm that only the selected years remain after filtering.
bike_df_final['year_stolen'].unique()

array([2018, 2017, 2016, 2015])