# Task 0: Visualize our parquet data frames.

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns 

In [None]:
# Load the three source tables from parquet files (paths are relative to the
# notebook's working directory). Later cells refer to them as df1, df2 and df3.
df1 = pd.read_parquet('GlobalLandTemperaturesByState.parquet')
df2 = pd.read_parquet('GlobalLandTemperaturesByCountry.parquet')
df3 = pd.read_parquet('GlobalTemperatures.parquet')

In [None]:
# Show df1 (read from GlobalLandTemperaturesByState.parquet) as the cell output.
df1

In [None]:
# Show df2 (read from GlobalLandTemperaturesByCountry.parquet) as the cell output.
df2

In [None]:
# Show df3 (read from GlobalTemperatures.parquet) as the cell output.
df3

# Task 5: Visualize our data distribution between different countries.

In [None]:
# Load Acre.parquet into df4.
# NOTE(review): presumably the rows for a single state ("Acre") — confirm
# against whatever produced this file.
df4 = pd.read_parquet('Acre.parquet')

In [None]:
# Show df4 (read from Acre.parquet) as the cell output.
df4

In [None]:
# Load the two count tables used in this section.
# NOTE(review): city_df is read from 'state_cnt.parquet', so it presumably
# holds per-state (not per-city) counts — confirm and consider renaming.
city_df = pd.read_parquet('state_cnt.parquet')
country_df = pd.read_parquet('country_cnt.parquet')

In [None]:
# Show city_df (read from state_cnt.parquet) as the cell output.
city_df

In [None]:
# Show country_df (read from country_cnt.parquet) as the cell output.
country_df

In [None]:
# Count the rows of country_df per 'Country', draw the counts as horizontal
# bars in the 'Dark2' palette, then hide the top and right borders of the
# axes the bars were drawn on.
(
    country_df
    .groupby('Country')
    .size()
    .plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
    .spines[['top', 'right']]
    .set_visible(False)
)

# Task 7: Visualize the results after handling null or NaN values.

In [None]:
def distribution_plot(df, col):
    """Draw one horizontal violin of ``col`` for every value of ``df['Country']``.

    Args:
        df: Data frame with a 'Country' column and the column named by ``col``.
        col: Name of the column whose distribution is drawn on the x axis.
    """
    # Scale the figure height with the number of distinct countries
    # (NaN counts as a value, matching ``unique()``) so violins do not overlap.
    country_count = df['Country'].nunique(dropna=False)
    plt.figure(figsize=(12, 1.2 * country_count))
    sns.violinplot(df, x=col, y='Country', inner='box', palette='Dark2')
    # Remove all four borders of the axes.
    sns.despine(top=True, right=True, bottom=True, left=True)
    
def _plot_series(series, series_name, col1, col2, series_index=0):
    """Plot ``series[col2]`` against ``series[col1]`` as a line on the current axes.

    Args:
        series: Data frame (or mapping of columns) holding both columns.
        series_name: Label attached to the line, used by legends.
        col1: Name of the column plotted on the x axis.
        col2: Name of the column plotted on the y axis.
        series_index: Position of this line among its siblings; selects the
            colour, wrapping around when it exceeds the palette length.
    """
    colors = list(sns.palettes.mpl_palette('Dark2'))
    x_values, y_values = series[col1], series[col2]
    line_color = colors[series_index % len(colors)]
    plt.plot(x_values, y_values, label=series_name, color=line_color)

def time_series(df, col1, col2):
    """Plot ``col2`` against ``col1`` as a single line on a new figure.

    Args:
        df: Data frame containing both columns.
        col1: Name of the x-axis column; rows are sorted by it (ascending)
            before plotting.
        col2: Name of the y-axis column.
    """
    fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')
    ordered = df.sort_values(col1, ascending=True)
    # Empty label: there is only one line, so no legend entry is needed.
    _plot_series(ordered, '', col1, col2)
    sns.despine(fig=fig, ax=ax)
    ax.set_xlabel(col1)
    ax.set_ylabel(col2)

def missing_cnt(df):
    """Print the total number of null/NaN cells across all columns of ``df``.

    Args:
        df: Data frame to inspect.
    """
    # First sum gives the per-column counts, second sum totals them.
    total_missing = df.isna().sum().sum()
    print(f'Number of NaN values: {total_missing}')

def country_weather(df, col='AverageTemperatureUncertainty'):
    """Plot ``col`` over the 'dt' column with one coloured line per country.

    Args:
        df: Data frame with 'dt' and 'Country' columns and the column named
            by ``col``.
        col: Name of the column plotted on the y axis. Defaults to
            'AverageTemperatureUncertainty'.
    """
    fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')
    df_sorted = df.sort_values('dt', ascending=True)
    for i, (series_name, series) in enumerate(df_sorted.groupby('Country')):
        _plot_series(series, series_name, 'dt', col, i)
    # Build the legend once, after every line has been drawn. Calling
    # fig.legend() inside the loop adds a new legend per country, all stacked
    # at the same anchor point.
    if ax.lines:
        fig.legend(title='Country', bbox_to_anchor=(1, 1), loc='upper left')
    sns.despine(fig=fig, ax=ax)
    plt.xlabel('dt')
    plt.ylabel(col)

In [None]:
# Load the two result files that the cells below compare.
# NOTE(review): judging by the file names, df5 is presumably the data with
# missing values still present and df6 the data after forward-filling —
# confirm against the job that writes these files.
df5 = pd.read_parquet('missing_cnt.parquet')
df6 = pd.read_parquet('forward_fill.parquet')

In [None]:
# Print the total NaN count of df1 (GlobalLandTemperaturesByState.parquet).
missing_cnt(df1)

In [None]:
# Print the total NaN count of df5 (missing_cnt.parquet).
missing_cnt(df5)

In [None]:
# Print the total NaN count of df6 (forward_fill.parquet).
missing_cnt(df6)

In [None]:
# Per-country violin plot of AverageTemperature in df5 (missing_cnt.parquet).
distribution_plot(df5, 'AverageTemperature')

In [None]:
# Per-country violin plot of AverageTemperature in df6 (forward_fill.parquet).
distribution_plot(df6, 'AverageTemperature')

In [None]:
# Per-country violin plot of AverageTemperatureUncertainty in df5
# (missing_cnt.parquet).
distribution_plot(df5, 'AverageTemperatureUncertainty')

In [None]:
# Per-country violin plot of AverageTemperatureUncertainty in df6
# (forward_fill.parquet).
distribution_plot(df6, 'AverageTemperatureUncertainty')

In [None]:
# One line per country of AverageTemperatureUncertainty over dt, for df5
# (missing_cnt.parquet).
country_weather(df5)

In [None]:
# One line per country of AverageTemperatureUncertainty over dt, for df6
# (forward_fill.parquet).
country_weather(df6)

In [None]:
# Single-line plot of AverageTemperatureUncertainty over dt for df5
# (missing_cnt.parquet).
time_series(df5,'dt', 'AverageTemperatureUncertainty')

In [None]:
# Single-line plot of AverageTemperatureUncertainty over dt for df6
# (forward_fill.parquet).
time_series(df6,'dt', 'AverageTemperatureUncertainty')

In [None]:
# Single-line plot of AverageTemperatureUncertainty over dt for df2
# (GlobalLandTemperaturesByCountry.parquet).
time_series(df2,'dt', 'AverageTemperatureUncertainty')

In [None]:
# Single-line plot of LandAverageTemperatureUncertainty over dt for df3
# (GlobalTemperatures.parquet).
time_series(df3,'dt', 'LandAverageTemperatureUncertainty')

You can continue cleaning GlobalLandTemperaturesByCountry.csv and GlobalTemperatures.csv in our Rust app, then check and visualize the resulting parquet files using the functions above.