# NYC Yellow Taxi Growth

#### The objectives of this program:
Using the data set that comes from NYC open data of **NYC Yellow Taxi Service** and **Uber Taxi Service** we would like to **analyze the growth** of the NYC Yellow Taxi Service by using the amount of pickups done daily.

We will be using correlation between taxi and uber in respective borough to observe the **correlation between the growth rate** of both taxi and uber to help understand how competition impacts pickup amounts. We will also use statistics to help **predict the growth** done within the next few days. We will be using **histograms and line charts** to demonstrate the growth.

In [3]:
sc

<pyspark.context.SparkContext at 0x6cfef28>

In [4]:
# imports
import pandas as pd
import numpy as np
from geopy.distance import great_circle
import matplotlib.dates as mpd
import matplotlib.pyplot as plt
import datetime
import csv

## Taxi vs Uber

In [None]:
taxi_aug14 = sc.textFile('../yellow_tripdata_2014-08.csv'   , use_unicode=False).filter(lambda x: x != "").cache()
uber_aug14 = sc.textFile('../uber-raw-data-aug14.csv', use_unicode=False).cache()
# list(enumerate(taxi_aug14.first().split(',')))
# list(enumerate(uber_aug14.first().split(',')))

### Borough Estimation

We will be using midpoints of each respective boughs to where the coordinates of the bus data lies.  
We will use the a default distance of 20 miles as the maximum value away from the midterm (that is Staten Island or Manhattan) ends up being around 20 miles.  
If both borough overlap we will use the closest distance. Otherwise, if it doesn't fall in borough distance we will consider it to be outside NYC.

In [1]:
import reverse_geocoder as rg
def getlocation(x):
    point_coord = (float(x[0]) , float(x[1]))
    this_location = rg.search(point_coord, mode = 1)
    print(this_location)
    if ( this_location[0]['admin2'] == 'Queens County'):
        return 'Queens'
    else :
        return this_location[0]['name']

### Data Ingestion for August 2014 Taxi Data

We will clean the data into:  
```((boro, pickup, 1)```  
Then we will group this data as:  
```[boro [(date, count)]]```

In [None]:
def extractTaxi(partId, records):
    if partId==0:
        records.next()
    reader = csv.reader(records)
    for row in reader:
        (pickup, boro) = (row[1].split(" ")[0], getlocation((row[6],row[5])))
        if boro in ['The Bronx', 'Brooklyn', 'Queens', 'Manhattan', 'Staten Island']:
            yield ((boro, pickup) , 1)
        continue

trdd = taxi_aug14.mapPartitionsWithIndex(extractTaxi)\
                .reduceByKey(lambda x, y: x+y)\
                .sortBy(lambda x:(x[0][0], x[0][1]))\
                .map(lambda x: (x[0][0], [(x[0][1], x[1])]))\
                .reduceByKey(lambda x, y: (x+y))
#trdd.take(10)

### Data Ingestion for August 2014 Uber Data

Just like the taxi data, we will format the Uber data just the same.

In [None]:
def extractUber(partId, records):
    if partId==0:
        records.next()
    import csv
    reader = csv.reader(records)
    for row in reader:
        (pickup, boro) = (row[0].split(" ")[0],  getlocation((row[1],row[2])))
        if boro in ['The Bronx', 'Brooklyn', 'Queens', 'Manhattan', 'Staten Island']:
            yield ((boro, pickup) , 1)
        continue

urdd = uber_aug14.mapPartitionsWithIndex(extractUber)\
                .reduceByKey(lambda x, y: x+y)\
                .map(lambda x: ((x[0][0], datetime.datetime.strptime(x[0][1], "%m/%d/%Y").strftime("%Y-%m-%d")), x[1]))\
                .sortBy(lambda x:(x[0][0], x[0][1]))\
                .map(lambda x: (x[0][0], [(x[0][1], x[1])]))\
                .reduceByKey(lambda x, y: (x+y))             
#urdd.take(10)

##### This function will get data from either of the dataset

In [None]:
# gets data given a key
def get_data(data, key):
    # returns ALL values
    if key == -1:
        return data.values().map(lambda x: list(zip(*x)[1])).collect()
    # returns ALL dates
    if key == -2:
        return data.values().map(lambda x: zip(*x)[0]).collect()[4]
    data = zip(*data.collect()[key][1])[1]
    if data:
        return data
    print "None found"
    return []
# don't use index 2 in the actual database because it is 'outside_nyc'
lboro = ['Brooklyn', 'Queens', 'Staten Island', 'Manhattan', 'Bronx']

### Scatter Line Graph for Taxi vs Uber

In [None]:
def make_scatter(time, tvalues, uvalues, key):
    plt.plot(time, tvalues,'-o', c ='blue', label = 'taxi')
    plt.plot(time, uvalues, '-o', c='red', label = 'uber')
    plt.title(lboro[key])
    plt.legend(loc='best')
    plt.xlabel('days')
    plt.ylabel('pickups')
    plt.show()

#### We will first make a scatter to observe the data

In [None]:
datetimes = list(range(len(get_data(urdd, 0))))

for i in range(6):
    if i != 2:
        make_scatter(datetimes, get_data(trdd, i), get_data(urdd, i), i)

### Cumilative Line Graph for Taxi vs Uber

In [None]:
uvb = [[],[],[],[],[]]
tvb = [[],[],[],[],[]]
# evaluate the histogram
tvb[0] = list(np.histogram(get_data(trdd, 0), bins=30))
tvb[1] = list(np.histogram(get_data(trdd, 1), bins=30))
tvb[2] = list(np.histogram(get_data(trdd, 3), bins=30))
tvb[3] = list(np.histogram(get_data(trdd, 4), bins=30))
tvb[4] = list(np.histogram(get_data(trdd, 5), bins=30))

uvb[0] = list(np.histogram(get_data(urdd, 0), bins=30))
uvb[1] = list(np.histogram(get_data(urdd, 1), bins=30))
uvb[2] = list(np.histogram(get_data(urdd, 3), bins=30))
uvb[3] = list(np.histogram(get_data(urdd, 4), bins=30))
uvb[4] = list(np.histogram(get_data(urdd, 5), bins=30))

#evaluate the cumulative
tvb[0][0] = np.cumsum(tvb[0][0])
tvb[1][0] = np.cumsum(tvb[1][0])
tvb[2][0] = np.cumsum(tvb[2][0])
tvb[3][0] = np.cumsum(tvb[3][0])
tvb[4][0] = np.cumsum(tvb[4][0])

uvb[0][0] = np.cumsum(uvb[0][0])
uvb[1][0] = np.cumsum(uvb[1][0])
uvb[2][0] = np.cumsum(uvb[2][0])
uvb[3][0] = np.cumsum(uvb[3][0])
uvb[4][0] = np.cumsum(uvb[4][0])

In [None]:
# plot the cumulative functions
def make_accum_graph(key):
    if key < 2:
        plt.plot(tvb[key][0], tvb[key][1][:-1], c='blue', label = 'taxi')
        plt.plot(uvb[key][0], uvb[key][1][:-1], c='red', label = 'uber')
        plt.title(lboro[key])
        plt.legend(loc='best')
        plt.xlabel('days')
        plt.ylabel('pickups')
        plt.show()
    elif key > 2:
        plt.plot(tvb[key-1][0], tvb[key-1][1][:-1], c='blue', label = 'taxi')
        plt.plot(uvb[key-1][0], uvb[key-1][1][:-1], c='red', label = 'uber')
        plt.title(lboro[key])
        plt.legend(loc='best')
        plt.xlabel('days')
        plt.ylabel('pickups')
        plt.show()
    
for i in range(6):
    make_accum_graph(i)

## Growth Rate Comparison

Then we will make growth rate comparison using histograph of boro based in seperate squares to get a better idea of increase and decrease correlation for both statistics.

In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [None]:
def get_growth(data, key):
    sqlc = SQLContext(sc)

    data = sc.parallelize(data.collect()[key][1])
    df = sqlc.createDataFrame(data, ["date", "value"])
    my_window = Window.partitionBy().orderBy("date")

    df = df.withColumn("prev_value", F.lag(df.value).over(my_window))
    df = df.withColumn("diff", F.when(F.isnull(((df.value - df.prev_value)/df.prev_value)*100), 0)
                                  .otherwise((df.value - df.prev_value)/df.prev_value)*100)
    return df.rdd.map(lambda x: x.date.encode("utf-8")).collect(), df.rdd.map(lambda x: x.diff).collect()

In [None]:
def make_growth_chart(time, tvalues, uvalues, key):
    plt.plot(time, tvalues,'-o', c ='blue', label = 'taxi')
    plt.plot(time, uvalues, '-o', c='red', label = 'uber')
    plt.axhline(linewidth=3, ls = 'dashed')
    plt.title(lboro[key])
    plt.legend(loc='best')
    plt.xlabel('days')
    plt.ylabel('pickups')
    plt.show()

In [None]:
datetimes = list(i for i in range(len(get_data(urdd, i))))

for i in range(6):
    if i != 2:
        make_growth_chart(datetimes, get_growth(trdd, i)[1], get_growth(urdd, i)[1], i)

### Coefficient Correlation

In [None]:
def make_coeff(slope, intercept, x, y, key):
    plt.plot(x, y, 'o', label = 'uber/taxi')
    plt.plot(x, intercept+ slope*x, '-', label = 'correlation' )
    plt.title(lboro[key])
    plt.legend(loc='best')
    plt.xlabel('taxi')
    plt.ylabel('uber')
    plt.show()

In [None]:
from scipy import stats
import numpy as np

l = list(range(len(get_data(urdd, 0))))

for i in range(6):
    if i != 2:
        #x = scipy.array(get_data(trdd, i))
        x = scipy.array(l)
        y = scipy.array(get_data(urdd, i))
        slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
        make_coeff(slope, intercept, x, y, i)
        print 'r-value: ', r_value
        print 'p-value: ', p_value

## Prediction

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split

In [9]:
columns = ['boro', 'days_in_month', 'number_of_pickup']
dic_boro = {'bronx': 0, 'brooklyn': 1, 'manhattan': 2, 'outside_nyc': 3, 'queens': 4, 'staten': 5}

In [10]:
Urdd_table = taxi_aug14.mapPartitionsWithIndex(extractUber)\
                .reduceByKey(lambda x, y: x+y)\
                .map(lambda x: ((x[0][0], datetime.datetime.strptime(x[0][1], "%m/%d/%Y").strftime("%Y-%m-%d")), x[1]))\
                .sortBy(lambda x:(x[0][0], x[0][1]))\
                .map(lambda x: (dic_boro[x[0][0]], int(datetime.datetime.strptime(x[0][1], "%Y-%m-%d").strftime("%d")),x[1]))
                               # (x[1]-min_boro[dic_boro[x[0][0]]])/(max_boro[dic_boro[x[0][0]]]- min_boro[dic_boro[x[0][0]]])))

In [11]:
df_urdd = Urdd_table.collect()
'''df = pd.DataFrame(Urdd_table.collect(), columns=columns)
df_boro = [[],[],[],[],[],[]]
for i in range(6):
    for j in range(len(df)):
        if df['boro'][j] == i:
            df_boro[i].append(df.loc[j].values)
'''


"df = pd.DataFrame(Urdd_table.collect(), columns=columns)\ndf_boro = [[],[],[],[],[],[]]\nfor i in range(6):\n    for j in range(len(df)):\n        if df['boro'][j] == i:\n            df_boro[i].append(df.loc[j].values)\n"

In [12]:
df_boro = [[],[],[],[],[],[]]
for i in range(6):
    for item in df_urdd:
        if item[0] == i:
            df_boro[i].append(item)
for i in range(6):
    df_boro[i] = pd.DataFrame(df_boro[i], columns=columns)

In [17]:
X = [[],[],[],[],[],[]]
y =[[],[],[],[],[],[]]
X_train=[[],[],[],[],[],[]]
X_test=[[],[],[],[],[],[]]
y_train=[[],[],[],[],[],[]] 
y_test=[[],[],[],[],[],[]]

for i in range(6):
    X[i] = df_boro[i][['boro', 'days_in_month']].values
    y[i] = df_boro[i][['number_of_pickup']].values
    X_train[i], X_test[i], y_train[i], y_test[i] = train_test_split(X[i], y[i], test_size=0.4, random_state=1)
    print(X_train[i].shape, X_validation[i].shape, X_test[i].shape)

AttributeError: 'list' object has no attribute 'shape'

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
X_validation, X_test, y_validation, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=1)
print(X_train.shape, X_validation.shape, X_test.shape)

AttributeError: 'list' object has no attribute 'shape'

### Cumilative Line Graph for Taxi vs Uber

In [20]:
linear_r = LinearRegression()
lasso_r = Lasso()
ridge_r = Ridge(alpha=1.0)
elastic_r = ElasticNet(alpha=1, l1_ratio=0.5)

In [21]:
for i in range(6):
    linear_r.fit(X_train[i], y_train[i])
    lasso_r.fit(X_train[i], y_train[i])
    ridge_r.fit(X_train[i], y_train[i])
    elastic_r.fit(X_train[i], y_train[i])
    print(lasso_r.predict(df_boro[i][['boro', 'days_in_month']].values))
'''    print(' accuracy of Linear regression: ',
      linear_r.score(X_test[i], y_test[i]),
      'accuracy of Lasso: ',
      lasso_r.score(X_test[i], y_test[i]),
      'accuracy of Ridge regression: ',
      ridge_r.score(X_test[i], y_test[i]),
      'accuracy of Elastic net: ',
      elastic_r.score(X_test[i], y_test[i])
     )'''

[370.65725806 372.57903226 374.50080645 376.42258065 378.34435484
 380.26612903 382.18790323 384.10967742 386.03145161 387.95322581
 389.875      391.79677419 393.71854839 395.64032258 397.56209677
 399.48387097 401.40564516 403.32741935 405.24919355 407.17096774
 409.09274194 411.01451613 412.93629032 414.85806452 416.77983871
 418.7016129  420.6233871  422.54516129 424.46693548 426.38870968
 428.31048387]


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.