## Exploratory Data analysis

In [15]:
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

------------------------------------
## **Uber Dataset Description**
------------------------------------

The data contains information about the weather, location, and pickups.

* pickup_dt: Date and time of the pick-up
* borough: NYC's borough
* pickups: Number of pickups for the period (1 hour)
* spd: Wind speed in miles/hour
* vsb: Visibility in miles to the nearest tenth
* temp: Temperature in Fahrenheit
* dewp: Dew point in Fahrenheit
* slp: Sea level pressure
* pcp01: 1-hour liquid precipitation
* pcp06: 6-hour liquid precipitation
* pcp24: 24-hour liquid precipitation
* sd: Snow depth in inches
* hday: Being a holiday (Y) or not (N)

In [16]:
data = pd.read_csv('uber.csv')

In [17]:
data.head()

Unnamed: 0,pickup_dt,borough,pickups,spd,vsb,temp,dewp,slp,pcp01,pcp06,pcp24,sd,hday
0,2015-01-01 01:00:00,Bronx,152,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
1,2015-01-01 01:00:00,Brooklyn,1519,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
2,2015-01-01 01:00:00,EWR,0,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
3,2015-01-01 01:00:00,Manhattan,5258,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y
4,2015-01-01 01:00:00,Queens,405,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,Y


In [18]:
data.tail()

Unnamed: 0,pickup_dt,borough,pickups,spd,vsb,temp,dewp,slp,pcp01,pcp06,pcp24,sd,hday
29096,2015-06-30 23:00:00,EWR,0,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N
29097,2015-06-30 23:00:00,Manhattan,3828,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N
29098,2015-06-30 23:00:00,Queens,580,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N
29099,2015-06-30 23:00:00,Staten Island,0,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N
29100,2015-06-30 23:00:00,,3,7.0,10.0,75.0,65.0,1011.8,0.0,0.0,0.0,0.0,N


In [19]:
data.describe()

Unnamed: 0,pickups,spd,vsb,temp,dewp,slp,pcp01,pcp06,pcp24,sd
count,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0
mean,490.215903,5.984924,8.818125,47.669042,30.823065,1017.817938,0.00383,0.026129,0.090464,2.529169
std,995.649536,3.699007,2.442897,19.814969,21.283444,7.768796,0.018933,0.093125,0.219402,4.520325
min,0.0,0.0,0.0,2.0,-16.0,991.4,0.0,0.0,0.0,0.0
25%,1.0,3.0,9.1,32.0,14.0,1012.5,0.0,0.0,0.0,0.0
50%,54.0,6.0,10.0,46.0,30.0,1018.2,0.0,0.0,0.0,0.0
75%,449.0,8.0,10.0,64.5,50.0,1022.9,0.0,0.0,0.05,2.958333
max,7883.0,21.0,10.0,89.0,73.0,1043.4,0.28,1.24,2.1,19.0


In [20]:
data.describe(exclude='number')

Unnamed: 0,pickup_dt,borough,hday
count,29101,26058,29101
unique,4343,6,2
top,2015-01-01 01:00:00,Bronx,N
freq,7,4343,27980


In [21]:
data.describe(include='all')

Unnamed: 0,pickup_dt,borough,pickups,spd,vsb,temp,dewp,slp,pcp01,pcp06,pcp24,sd,hday
count,29101,26058,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101
unique,4343,6,,,,,,,,,,,2
top,2015-01-01 01:00:00,Bronx,,,,,,,,,,,N
freq,7,4343,,,,,,,,,,,27980
mean,,,490.215903,5.984924,8.818125,47.669042,30.823065,1017.817938,0.00383,0.026129,0.090464,2.529169,
std,,,995.649536,3.699007,2.442897,19.814969,21.283444,7.768796,0.018933,0.093125,0.219402,4.520325,
min,,,0.0,0.0,0.0,2.0,-16.0,991.4,0.0,0.0,0.0,0.0,
25%,,,1.0,3.0,9.1,32.0,14.0,1012.5,0.0,0.0,0.0,0.0,
50%,,,54.0,6.0,10.0,46.0,30.0,1018.2,0.0,0.0,0.0,0.0,
75%,,,449.0,8.0,10.0,64.5,50.0,1022.9,0.0,0.0,0.05,2.958333,


In [22]:
data.shape

(29101, 13)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29101 entries, 0 to 29100
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pickup_dt  29101 non-null  object 
 1   borough    26058 non-null  object 
 2   pickups    29101 non-null  int64  
 3   spd        29101 non-null  float64
 4   vsb        29101 non-null  float64
 5   temp       29101 non-null  float64
 6   dewp       29101 non-null  float64
 7   slp        29101 non-null  float64
 8   pcp01      29101 non-null  float64
 9   pcp06      29101 non-null  float64
 10  pcp24      29101 non-null  float64
 11  sd         29101 non-null  float64
 12  hday       29101 non-null  object 
dtypes: float64(9), int64(1), object(3)
memory usage: 2.9+ MB


In [24]:
data.value_counts()

pickup_dt            borough        pickups  spd  vsb   temp  dewp  slp     pcp01  pcp06  pcp24  sd         hday
2015-01-01 01:00:00  Bronx          152      5.0  10.0  30.0  7.0   1023.5  0.0    0.0    0.0    0.000000   Y       1
2015-05-01 15:00:00  Queens         513      9.0  10.0  55.0  38.0  1015.6  0.0    0.0    0.0    0.000000   N       1
2015-05-01 17:00:00  EWR            0        3.0  10.0  60.0  40.0  1014.8  0.0    0.0    0.0    0.000000   N       1
                     Brooklyn       548      3.0  10.0  60.0  40.0  1014.8  0.0    0.0    0.0    0.000000   N       1
                     Bronx          84       3.0  10.0  60.0  40.0  1014.8  0.0    0.0    0.0    0.000000   N       1
                                                                                                                   ..
2015-03-02 08:00:00  Brooklyn       496      6.0  7.0   30.0  24.0  1017.8  0.0    0.0    0.0    10.625000  N       1
                     Bronx          59       6.0  7.0   30.0 

In [25]:
data.value_counts(dropna=False)

pickup_dt            borough        pickups  spd  vsb   temp  dewp  slp     pcp01  pcp06  pcp24  sd         hday
2015-01-01 01:00:00  Bronx          152      5.0  10.0  30.0  7.0   1023.5  0.0    0.0    0.0    0.000000   Y       1
2015-05-02 11:00:00  Bronx          67       0.0  10.0  49.0  42.0  1015.6  0.0    0.0    0.0    0.000000   N       1
2015-05-02 14:00:00  Staten Island  5        0.0  10.0  58.0  40.0  1015.4  0.0    0.0    0.0    0.000000   N       1
                     Queens         358      0.0  10.0  58.0  40.0  1015.4  0.0    0.0    0.0    0.000000   N       1
                     Manhattan      2727     0.0  10.0  58.0  40.0  1015.4  0.0    0.0    0.0    0.000000   N       1
                                                                                                                   ..
2015-03-02 22:00:00  EWR            0        7.0  10.0  38.0  11.0  1021.6  0.0    0.0    0.0    13.541667  N       1
                     Brooklyn       442      7.0  10.0  38.0 

In [31]:
df.isnull().sum()
#data.dropna(inplace= True) 

pickup_dt       0
borough      3043
pickups         0
spd             0
vsb             0
temp            0
dewp            0
slp             0
pcp01           0
pcp06           0
pcp24           0
sd              0
hday            0
dtype: int64

In [29]:
def missing_check(df):
    total = df.count()
    missing = df.isnull().sum().sort_values(ascending=False)   # total number of null values
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)  # percentage of values that are null
    missing_data = pd.concat([total, missing, percent], axis=1, keys=['Total', 'Missing', 'Percent'])  # putting the above two together
    return missing_data # return the dataframe
missing_check(data)

Unnamed: 0,Total,Missing,Percent
pickup_dt,29101,0,0.0
borough,26058,3043,0.104567
pickups,29101,0,0.0
spd,29101,0,0.0
vsb,29101,0,0.0
temp,29101,0,0.0
dewp,29101,0,0.0
slp,29101,0,0.0
pcp01,29101,0,0.0
pcp06,29101,0,0.0
