# Authenticating With Google Drive


In [2]:
!pip install -U -q pyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

twisted 18.7.0 requires PyHamcrest>=1.9.0, which is not installed.
You are using pip version 10.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Authenticate the current user and build a PyDrive client.
# NOTE(review): `auth` comes from `google.colab`, so this cell presumably only
# works inside a Colab runtime (the import above failed elsewhere) — confirm.
auth.authenticate_user()
gauth = GoogleAuth()
# Reuse the application-default credentials for PyDrive instead of running
# PyDrive's own auth flow.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the handle used by later cells to fetch files by id.
drive = GoogleDrive(gauth)

# Importing Dependencies.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# Loading the Datasets from Google Drive

In [None]:
# Google Drive file ids of the two CSVs used by this notebook.
TRAIN_FILE_ID = '1zBZDF5jcG2PtUVisT33sihl2u0CEJClc'
TEST_FILE_ID = '1OsLQL-95ZuISVkgvsW5h0WxEv3LzcsVK'

# Fetch the training file into the working directory as train.csv.
train_downloaded = drive.CreateFile({'id': TRAIN_FILE_ID})
train_downloaded.GetContentFile('train.csv')

# Fetch the test file into the working directory as test.csv.
test_downloaded = drive.CreateFile({'id': TEST_FILE_ID})
test_downloaded.GetContentFile('test.csv')

In [None]:
# Load the training CSV (written by the download cell) and preview the first rows.
df_train=pd.read_csv('train.csv')
df_train.head()

In [None]:
# Load the test CSV (written by the download cell) and preview the first rows.
df_test = pd.read_csv('test.csv')
df_test.head()

In [None]:
# Render floats with three decimals in every DataFrame display from here on,
# then show summary statistics of the training data.
pd.set_option('display.float_format', '{:.3f}'.format)
df_train.describe()

# Getting Information about the features

In [None]:
# Column names, dtypes and non-null counts of the training frame as loaded.
df_train.info()

# Filtering the Dataset (Data Cleaning)

In [None]:
# Keep only trips whose duration lies within two standard deviations of the
# mean; both statistics are computed on the unfiltered column.
m = np.mean(df_train['trip_duration'])
s = np.std(df_train['trip_duration'])
lower_bound = m - 2 * s
upper_bound = m + 2 * s
# `between` is inclusive on both ends, matching the original <= / >= pair.
df_train = df_train[df_train['trip_duration'].between(lower_bound, upper_bound)]

In [None]:
# Keep only trips whose pickup AND dropoff fall inside a fixed lon/lat box
# (bounds inclusive).
# NOTE(review): the box presumably covers the city area of interest — confirm.
LON_MIN, LON_MAX = -74.03, -73.75
LAT_MIN, LAT_MAX = 40.63, 40.85

inside_box = (
    df_train['pickup_longitude'].between(LON_MIN, LON_MAX)
    & df_train['pickup_latitude'].between(LAT_MIN, LAT_MAX)
    & df_train['dropoff_longitude'].between(LON_MIN, LON_MAX)
    & df_train['dropoff_latitude'].between(LAT_MIN, LAT_MAX)
)
df_train = df_train[inside_box]


# Preprocessing

In [None]:
# Parse the pickup timestamp in both frames and derive a calendar-date column
# from it (used later for per-day grouping).
for frame in (df_train, df_test):
    frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])
for frame in (df_train, df_test):
    frame.loc[:, 'pickup_date'] = frame['pickup_datetime'].dt.date

# Only the training frame's dropoff timestamp is parsed here.
df_train['dropoff_datetime'] = pd.to_datetime(df_train['dropoff_datetime'])

In [None]:
# Re-inspect dtypes and non-null counts after filtering and datetime parsing.
df_train.info()

# Getting an Idea About the Dataset

In [None]:
# Default figure size for this and all later figures.
plt.rcParams['figure.figsize'] = [16, 10]

# Distribution of raw trip durations, 100 bins.
fig, ax = plt.subplots()
ax.hist(df_train['trip_duration'].values, bins=100)
ax.set_xlabel('Trip_duration')
ax.set_ylabel('No_of_trips')
plt.show()


In [None]:
# Store log(trip_duration + 1) as a new column; the +1 keeps a zero duration finite.
df_train['log_trip_duration'] = np.log(df_train['trip_duration'].values + 1)

# Histogram of the log-transformed durations, 100 bins.
fig, ax = plt.subplots()
ax.hist(df_train['log_trip_duration'].values, bins=100)
ax.set_xlabel('log(trip_duration)')
ax.set_ylabel('no_of_observations')
plt.show()

# Same column again as a seaborn distribution plot, drawn on a fresh figure.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — consider
# migrating once the target seaborn version is confirmed.
sns.distplot(df_train['log_trip_duration'], bins=100)

# Comparing and visualising training and test data.

In [None]:
# Number of trips per pickup date, train and test overlaid on one axes.
for frame, label in ((df_train, 'train'), (df_test, 'test')):
    trips_per_day = frame.groupby('pickup_date').count()[['id']]
    plt.plot(trips_per_day, '-o', label=label)

plt.title('Trips Per Day')
plt.legend(loc=1)
plt.ylabel('Trips')
plt.show()

In [None]:
import warnings
# Suppresses ALL warnings from this cell onward (kept from the original notebook).
warnings.filterwarnings('ignore')

# Mean trip duration in seconds for each vendor.
plot_vendor = df_train.groupby('vendor_id')['trip_duration'].mean()
print(plot_vendor)

fig, ax = plt.subplots(1, 1, figsize=(15, 10))
# Zoom the y-axis to 800-840 so the difference between the bars is visible;
# setting the limits first turns off y autoscaling for the bars drawn next.
ax.set_ylim(800, 840)
# x/y must be passed by keyword: recent seaborn releases reject positional
# data vectors, and the keyword form also works on older releases.
sns.barplot(x=plot_vendor.index, y=plot_vendor.values, ax=ax)
ax.set_title('Time Per Vendor')
ax.set_ylabel('Time in Seconds')
plt.show()

In [None]:
# Mean trip duration in seconds for each store_and_fwd_flag value.
snwflag = df_train.groupby('store_and_fwd_flag')['trip_duration'].mean()

fig, ax = plt.subplots(1, 1, figsize=(17, 10))
# Zoom the y-axis to 800-1100; setting the limits first turns off y
# autoscaling for the bars drawn next.
ax.set_ylim(800, 1100)
# x/y must be passed by keyword: recent seaborn releases reject positional
# data vectors, and the keyword form also works on older releases.
sns.barplot(x=snwflag.index, y=snwflag.values, ax=ax)
# Labels are applied after plotting so seaborn cannot overwrite them.
ax.set_title('Time per store_and_fwd_flag')
ax.set_ylabel('Time in seconds')
plt.show()

In [None]:
# Mean trip duration in seconds for each passenger_count value.
pc = df_train.groupby('passenger_count')['trip_duration'].mean()

fig, ax = plt.subplots(1, 1, figsize=(17, 10))
# Cap only the top of the y-axis at 1100; the bottom keeps its default.
ax.set_ylim(top=1100)
# x/y must be passed by keyword: recent seaborn releases reject positional
# data vectors, and the keyword form also works on older releases.
sns.barplot(x=pc.index, y=pc.values, ax=ax)
# Title added for consistency with the vendor and store_and_fwd_flag plots.
ax.set_title('Time per passenger_count')
ax.set_ylabel('Time in seconds')
plt.show()

In [None]:
# Number of training rows for each passenger_count value.
df_train.groupby('passenger_count').size()

In [None]:
# Number of test rows for each passenger_count value, for comparison with train.
df_test.groupby('passenger_count').size()