In [111]:
import pandas as pd               

# Interquartile-range helper, usable as a custom aggregation function.
def IQR(column):
    """Return the interquartile range of ``column``.

    The result is the 75th percentile minus the 25th percentile, both
    obtained from ``column.quantile``.
    """
    lower_quartile = column.quantile(0.25)
    upper_quartile = column.quantile(0.75)
    return upper_quartile - lower_quartile

In [112]:
# Read the raw export into a DataFrame.
london = pd.read_csv('data/10-11_London_Bridge.txt')

# Discard the fourth column (position 3) and give the remaining columns
# descriptive names.
unused_column = london.columns[3]
df = london.drop(columns=unused_column)
df.columns = ['datetime', 'water_level', 'is_high_tide']

# Inspect the dtypes pandas inferred for each column.
df.dtypes

datetime        object
water_level     object
is_high_tide     int64
dtype: object

In [113]:
# Parse the timestamp strings once, then store both the parsed timestamps and
# the calendar year derived from them (the year is used for per-year grouping).
timestamps = pd.to_datetime(df['datetime'])
df['datetime'] = timestamps
df['year'] = timestamps.dt.year

# water_level was read as object dtype; cast it to float.
water_level_as_float = df['water_level'].astype(float)
df['water_level'] = water_level_as_float

In [114]:
# Mean, median and interquartile range of the water level, computed separately
# for high-tide rows (is_high_tide == 1) and low-tide rows (is_high_tide == 0).
# Each result is a pandas Series indexed by 'mean', 'median' and 'IQR'.

high = df[df['is_high_tide'] == 1]
low = df[df['is_high_tide'] == 0]

high_statistics = high['water_level'].agg(['mean', 'median', IQR])
low_statistics = low['water_level'].agg(['mean', 'median', IQR])

# Difference (high minus low) for each statistic, rounded to two decimals.
# Uses the two Series defined just above so the cell runs on a fresh kernel;
# the previous names `high_values` / `low_values` are not defined anywhere.
diff = (high_statistics - low_statistics).round(2)
diff

IQR       0.21
mean      5.70
median    5.77
Name: water_level, dtype: float64

In [115]:
# Per-year share of high-tide rows whose water level exceeds the 90th
# percentile of all high-tide water levels, stored as floats in a two-column
# DataFrame (year, very_high_ratio).

# The threshold is computed once over the whole high-tide record.
very_high_threshold = high['water_level'].quantile(0.9)

# Number of high-tide water-level values per year (the denominator).
all_high = high.groupby('year')['water_level'].count()

# Number of values above the threshold per year (the numerator).
very_high = (
    high[high['water_level'] > very_high_threshold]
    .groupby('year')['water_level']
    .count()
    # A year with no value above the threshold is absent from this count;
    # reindex so it contributes 0 instead of becoming NaN after the division.
    .reindex(all_high.index, fill_value=0)
)

very_high_ratio = (very_high / all_high).rename('very_high_ratio').reset_index()

# Display the five years with the largest share of very high tides.
very_high_ratio.sort_values(by='very_high_ratio', ascending=False).head().round(3).set_index('year')


Unnamed: 0_level_0,very_high_ratio
year,Unnamed: 1_level_1
1960,0.211
1961,0.171
1995,0.17
1958,0.165
1979,0.161


In [116]:
# Per-year share of low-tide rows whose water level falls below the 10th
# percentile of all low-tide water levels, stored as floats in a two-column
# DataFrame (year, very_low_ratio).

# The threshold is computed once over the whole low-tide record.
very_low_threshold = low['water_level'].quantile(0.1)

# Number of low-tide water-level values per year (the denominator).
all_low = low.groupby('year')['water_level'].count()

# Number of values below the threshold per year (the numerator).
very_low = (
    low[low['water_level'] < very_low_threshold]
    .groupby('year')['water_level']
    .count()
    # A year with no value below the threshold is absent from this count;
    # reindex so it contributes 0 instead of becoming NaN after the division.
    .reindex(all_low.index, fill_value=0)
)

very_low_ratio = (very_low / all_low).rename('very_low_ratio').reset_index()

# Display the five years with the smallest share of very low tides.
very_low_ratio.sort_values(by='very_low_ratio').head().round(3).set_index('year')

Unnamed: 0_level_0,very_low_ratio
year,Unnamed: 1_level_1
1913,0.022
1915,0.033
1914,0.039
1951,0.043
1969,0.045


In [117]:
# Collect the four results of the analysis in a single dictionary named
# `solution`, then print it.
solution = dict(
    high_statistics=high_statistics,
    low_statistics=low_statistics,
    very_high_ratio=very_high_ratio,
    very_low_ratio=very_low_ratio,
)
print(solution)

{'high_statistics': mean      3.318373
median    3.352600
IQR       0.743600
Name: water_level, dtype: float64, 'low_statistics': mean     -2.383737
median   -2.412900
IQR       0.538200
Name: water_level, dtype: float64, 'very_high_ratio':     year  very_high_ratio
0   1911         0.004098
1   1912         0.032316
2   1913         0.082212
3   1914         0.055313
4   1915         0.045045
..   ...              ...
80  1991         0.096317
81  1992         0.103253
82  1993         0.145923
83  1994         0.150355
84  1995         0.170213

[85 rows x 2 columns], 'very_low_ratio':     year  very_low_ratio
0   1911        0.060606
1   1912        0.066667
2   1913        0.022388
3   1914        0.039017
4   1915        0.033435
..   ...             ...
80  1991        0.150355
81  1992        0.107496
82  1993        0.112696
83  1994        0.106383
84  1995        0.107801

[85 rows x 2 columns]}
