In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from scipy.stats import skew
from scipy.stats import kurtosis
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
stations_df = pd.read_csv('../data/201508_station_data.csv')

In [3]:
stations_df.head()

Unnamed: 0,station_id,name,lat,long,dockcount,landmark,installation
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
2,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
3,5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013
4,6,San Pedro Square,37.336721,-121.894074,15,San Jose,8/7/2013


In [4]:
stations_df['installation'] = pd.to_datetime(stations_df['installation'])

In [5]:
stations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   station_id    70 non-null     int64         
 1   name          70 non-null     object        
 2   lat           70 non-null     float64       
 3   long          70 non-null     float64       
 4   dockcount     70 non-null     int64         
 5   landmark      70 non-null     object        
 6   installation  70 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(2), object(2)
memory usage: 4.0+ KB


In [6]:
stations_df_wo_id = stations_df.drop(axis=1, columns=['station_id'])

In [7]:
stations_df_wo_id.describe()

Unnamed: 0,lat,long,dockcount
count,70.0,70.0,70.0
mean,37.590243,-122.218416,17.657143
std,0.203473,0.209446,4.010442
min,37.329732,-122.418954,11.0
25%,37.389483,-122.400601,15.0
50%,37.631163,-122.312123,15.0
75%,37.788123,-122.078009,19.0
max,37.80477,-121.877349,27.0


Univariate analysis

In [8]:
il = lambda q1, iqr: q1 - 1.5*iqr
sl = lambda q3, iqr: q3 + 1.5*iqr
iqr = lambda q1, q3: q3-q1

In [9]:
def get_stats_from_column(data_frame: pd.DataFrame, column: str, hist:bool = False) -> None:
    description = data_frame[column].describe()
    iqr_value = iqr(description[4], description[6])
    il_value = il(description[4], iqr_value)
    sl_value = sl(description[6], iqr_value)
    var_value = data_frame[column].var()
    std_value = data_frame[column].std()
    skewness_value = skew(data_frame[column])
    kurtosis_value = kurtosis(data_frame[column])
    print(f'Measures of central tendency: \n\n{description}')

    print('+----------------------------------------------------------------------------------------------------------------------------------+')
    print ("|{:<20} | {:<25} | {:<10} | {:<20} | {:<20} | {:<20}|".format('inferior limit', 'superior limit', 'IQR', 'var', 'std', 'Skewness', 'Kurtosis'))
    print('+----------------------------------------------------------------------------------------------------------------------------------+')

    print ("|{:<20} | {:<25} | {:<10} | {:<20} | {:<20} | {:<20}|"
    .format(il_value,sl_value,iqr_value, var_value, std_value, skewness_value, kurtosis_value))
    print('+----------------------------------------------------------------------------------------------------------------------------------+')
    if hist:
        title = "Histogram for {columnName}".format(columnName=column)
        fig = px.histogram(data_frame, x=column, title = title)
        fig.show()
        fig2 = px.box(data_frame, y=column, title="Boxplot for "+column)
        fig2.show()
    else:
        fig = px.box(data_frame, y=column, title="Boxplot for "+column)
        fig.show()

In the next section is possible to see different stats. We have a positive skweness which it is also shows in the histogram of the variable, the most amount of data are to the left of the mean which is 17.65 with a standard deviation of 4.01 more or less. 

In [10]:
get_stats_from_column(stations_df_wo_id, 'dockcount', True)

Measures of central tendency: 

count    70.000000
mean     17.657143
std       4.010442
min      11.000000
25%      15.000000
50%      15.000000
75%      19.000000
max      27.000000
Name: dockcount, dtype: float64
+----------------------------------------------------------------------------------------------------------------------------------+
|inferior limit       | superior limit            | IQR        | var                  | std                  | Skewness            |
+----------------------------------------------------------------------------------------------------------------------------------+
|9.0                  | 25.0                      | 4.0        | 16.08364389233954    | 4.010441857493952    | 0.7369248035176991  |
+----------------------------------------------------------------------------------------------------------------------------------+
