In [0]:
# Mount Google Drive into the Colab runtime at /content/drive so the project
# folder below is reachable through the normal filesystem.
from google.colab import drive
drive.mount('/content/drive')
# Folder that is globbed for "*.csv" by the data-loading cells.
file_path = '/content/drive/My Drive/Cocus'
# Output folder for saved artefacts. NOTE: there is no trailing slash, so build
# file names with os.path.join(save_path, name) rather than save_path + name.
save_path = '/content/drive/My Drive/Cocus/save_files'

In [0]:
import glob, os
import numpy as np
import pandas as pd
import pandas_profiling
from functools import reduce
import matplotlib.pyplot as plt

**Read and combine all CSV files into a single DataFrame using pandas**

In [0]:
# Read all files.
# The paths are sorted so that the merge (and therefore the column order of
# all_df) does not depend on the order in which the filesystem lists them.
csv_paths = sorted(glob.glob(os.path.join(file_path, "*.csv")))
if not csv_paths:
    # Without this guard reduce() below fails with an opaque
    # "reduce() of empty iterable" TypeError.
    raise FileNotFoundError(f"No CSV files found in {file_path!r}")

# Remove the 'Unnamed' columns from each file *before* merging.
# If several files carry such a column, the chained merge produces colliding
# 'Unnamed: 0_x' / 'Unnamed: 0_y' names, which recent pandas versions reject
# with a MergeError; dropping them up front gives the same final columns.
country_frames = []
for csv_path in csv_paths:
    frame = pd.read_csv(csv_path)
    country_frames.append(frame.loc[:, ~frame.columns.str.contains('Unnamed')])

# Merge based on the country column.
# pd.merge defaults to an inner join, so only countries present in every file
# survive into all_df.
all_df = reduce(lambda df1, df2: pd.merge(df1, df2, on='country'), country_frames)


**Read and combine the CSV files into separate DataFrames, one per year, using pandas**

In [0]:
# Load every CSV into a one-entry dict. The key is the four characters that
# precede ".csv" in the path; the tag "ther" is stored under "weather".
dfs = []
for csv_path in glob.glob(os.path.join(file_path, "*.csv")):
    tag = csv_path[-8:-4]
    label = "weather" if tag == "ther" else tag
    dfs.append({label: pd.read_csv(csv_path)})


def _frames_tagged(label):
    """Return, in load order, every frame in ``dfs`` stored under ``label``."""
    return [entry[label] for entry in dfs if label in entry]


def _merge_on_country(left, right):
    """Join two frames on their shared 'country' column."""
    return pd.merge(left, right, on='country')


# Year 2011 (first matching file)
all_df_2011 = _frames_tagged("2011")[0]

# Year 2013 (all matching files merged on country)
df_2013 = _frames_tagged("2013")
all_df_2013 = reduce(_merge_on_country, df_2013)

# Year 2015 (first matching file)
all_df_2015 = _frames_tagged("2015")[0]

# Year 2016 (all matching files merged on country)
df_2016 = _frames_tagged("2016")
all_df_2016 = reduce(_merge_on_country, df_2016)

# Weather (first matching file, with any 'Unnamed' columns removed)
weather_raw = _frames_tagged("weather")[0]
all_df_weather = weather_raw.loc[:, ~weather_raw.columns.str.contains('Unnamed')]


**Exploratory Data Analysis**

**Step 1: What does the data look like?**

---



In [19]:
# Headline facts about the merged frame: its shape and the number of
# distinct countries. Each message is printed on its own line.
summary_messages = (
    f"The Data Frame contains {all_df.shape[0]} rows and {all_df.shape[1]} columns \n",
    f"In total there were {all_df.country.nunique()} countries provided along with their socio-economic indices \n",
)
for message in summary_messages:
    print(message)

# Full list of column names
print("The column names were : \n", all_df.columns)


The Data Frame contains 32 rows and 42 columns 

In total there were 32 countries provided along with their socio-economic indices 

The column names were : 
 Index(['country', 'prct_low_savings', 'police_trust_rating',
       'prct_budget_veryhard', 'prct_budget_hard', 'prct_budget_somehard',
       'prct_budget_someeasy', 'prct_budget_easy', 'prct_budget_veryeasy',
       'prct_health_verygood', 'prct_health_good', 'prct_health_fair',
       'prct_health_bad', 'prct_health_verybad', 'legal_trust_rating',
       'prct_rpt_pollution', 'med_income_underemp', 'median_income',
       'total_pop', 'prct_yng_adt_pop', 'political_trust_rating', 'unemp_rate',
       'prct_env_satis_high', 'prct_env_satis_med', 'prct_env_satis_low',
       'gdp', 'life_expect', 'prct_close_relat', 'prct_leisure_satis_high',
       'prct_leisure_satis_med', 'prct_leisure_satis_low', 'avg_temp',
       'avg_high_temp', 'avg_low_temp', 'avg_precipitation', 'prct_rpt_crime',
       'prct_job_satis_high', 'prct_job

**Step 2 : Understand the Data using pandas profiling**

---


- Instead of using `df.describe()` and `df.info()`, use pandas profiling to get a general idea of the dataset.
- Pandas profiling performs a quick exploratory data analysis.
- It reports histograms, correlations, per-variable information, missing values, summary statistics, etc. for the dataset.
- Link : [pandas_profiling_documentation](https://pandas-profiling.github.io/pandas-profiling/docs/#documentation)
- Note: It is convenient for small datasets, and the report can be saved as an HTML file.

In [12]:
## Render the pandas-profiling report for the merged frame inline.
## It is the last live expression of the cell, so the report is the cell output.
all_df.profile_report(style={'full_width':True})

## Optional: save the report as an HTML file instead (kept disabled).
## save_path has no trailing slash, so join the file name with os.path.join.
# profile = all_df.profile_report(title='Pandas Profiling Report')
# profile.to_file(os.path.join(save_path, "Europe data profiling.html")) # save location



Points Learnt

```
# This is formatted as code
```



**Step 3: Visualize the data to understand it better**

---
1. 


In [0]:
## Load and configure the libraries used by the visualization cells

# Standard plotly imports (ff provides create_annotated_heatmap, used for the
# correlation heatmaps further down)
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import plotly.figure_factory as ff

# Use plotly + cufflinks in offline mode. The plotting cells call
# DataFrame.iplot(...), which is presumably provided by importing cufflinks.
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

# Renderer selection: "colab" targets Google Colab; change this value when
# running the notebook in a different front end.
import plotly.io as pio
pio.renderers.default = "colab"

In [22]:
# Interactive scatter of GDP (x) against life expectancy (y), with the
# 'country' column passed as the category column.
gdp_vs_life_expect = dict(
    kind="scatter",
    x='gdp',
    y='life_expect',
    categories='country',
    xTitle='GDP',
    yTitle='Life_Expectancy',
    title='GDP Vs Life Expectancy',
)
all_df.iplot(**gdp_vs_life_expect)


The above scatter plot shows
*   Germany 
*   List item



In [0]:
# Bar chart of 'med_income_underemp' (y) against 'unemp_rate' (x), with
# 'country' as the category column.
# The axis titles and figure title previously still read GDP / Life
# Expectancy (copied from the scatter plot) and mislabelled this chart.
# NOTE(review): the human-readable labels are inferred from the column names;
# confirm them against the data dictionary.
all_df.iplot(
    kind="bar",
    x='unemp_rate',
    y='med_income_underemp',
    categories='country',
    xTitle='Unemployment Rate',
    yTitle='Median Income (Underemployed)',
    title='Unemployment Rate Vs Median Income (Underemployed)',
)

In [14]:
# Correlation heatmap over the job / life / leisure satisfaction shares.
# The prct_env_satis_* columns are not included in this heatmap.
satisfaction_cols = [
    'prct_job_satis_high', 'prct_job_satis_med', 'prct_job_satis_low',
    'prct_life_satis_high', 'prct_life_satis_med', 'prct_life_satis_low',
    'prct_leisure_satis_high', 'prct_leisure_satis_med', 'prct_leisure_satis_low',
]
corr_test = all_df[satisfaction_cols]

# Pairwise correlations, annotated with values rounded to two decimals
corrs = corr_test.corr()
heatmap_kwargs = dict(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True,
)
fig = ff.create_annotated_heatmap(**heatmap_kwargs)

# Shrink the annotation text so the cell values fit
for annotation in fig.layout.annotations:
    annotation.font.size = 10
fig.show()

In [0]:
# Satisfaction columns grouped by level, four topics per level
# (job, life, leisure, environment), so each level is one 4-column slice.
high_satis_cols = ['prct_job_satis_high', 'prct_life_satis_high',
                   'prct_leisure_satis_high', 'prct_env_satis_high']
med_satis_cols = ['prct_job_satis_med', 'prct_life_satis_med',
                  'prct_leisure_satis_med', 'prct_env_satis_med']
low_satis_cols = ['prct_job_satis_low', 'prct_life_satis_low',
                  'prct_leisure_satis_low', 'prct_env_satis_low']

df = all_df[high_satis_cols + med_satis_cols + low_satis_cols]

In [25]:
# One correlation heatmap per 4-column slice of df (columns 0-3, 4-7, 8-11).
# The annotation loop no longer reuses the outer loop variable: previously the
# inner `for i in ...` overwrote the slice start `i` on every pass.
for start, stop in zip([0, 4, 8], [4, 8, 12]):
    corrs = df.iloc[:, start:stop].corr()
    fig = ff.create_annotated_heatmap(
        z=corrs.values,
        x=list(corrs.columns),
        y=list(corrs.index),
        annotation_text=corrs.round(2).values,
        showscale=True)

    # Make text size smaller
    for annotation in fig.layout.annotations:
        annotation.font.size = 10
    fig.show()
    # Visual separator between consecutive figures
    print("----------------------------------------------------------------------------------------------------------------------")

----------------------------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------------------------


In [18]:
# Preview the first three columns of the satisfaction subset `df`
# (all rows; the selection is purely positional).
df.iloc[:, :3]

Unnamed: 0,prct_job_satis_high,prct_life_satis_high,prct_leisure_satis_high
0,42.2,37.9,35.3
1,23.0,20.9,21.5
2,16.1,5.9,14.6
3,36.6,38.5,25.8
4,28.2,14.2,24.0
5,29.6,21.3,24.3
6,25.0,25.0,24.2
7,44.4,42.7,41.4
8,26.6,13.5,22.2
9,14.0,12.8,11.6


In [5]:
# List every column name of the merged frame.
all_df.columns

Index(['country', 'prct_low_savings', 'police_trust_rating',
       'prct_budget_veryhard', 'prct_budget_hard', 'prct_budget_somehard',
       'prct_budget_someeasy', 'prct_budget_easy', 'prct_budget_veryeasy',
       'prct_health_verygood', 'prct_health_good', 'prct_health_fair',
       'prct_health_bad', 'prct_health_verybad', 'legal_trust_rating',
       'prct_rpt_pollution', 'med_income_underemp', 'median_income',
       'total_pop', 'prct_yng_adt_pop', 'political_trust_rating', 'unemp_rate',
       'prct_env_satis_high', 'prct_env_satis_med', 'prct_env_satis_low',
       'gdp', 'life_expect', 'prct_close_relat', 'prct_leisure_satis_high',
       'prct_leisure_satis_med', 'prct_leisure_satis_low', 'avg_temp',
       'avg_high_temp', 'avg_low_temp', 'avg_precipitation', 'prct_rpt_crime',
       'prct_job_satis_high', 'prct_job_satis_med', 'prct_job_satis_low',
       'prct_life_satis_high', 'prct_life_satis_med', 'prct_life_satis_low'],
      dtype='object')

# K-Means clustering

In [0]:
# Feature matrix: every column of all_df except the first one.
# NOTE(review): the features are passed to KMeans unscaled; columns with large
# magnitudes may dominate the distances. Consider standardizing first.
X = all_df.iloc[:, 1:]

# Elbow method: fit K-Means for 1..10 clusters and record the within-cluster
# sum of squares (inertia) of each fit.
from sklearn.cluster import KMeans

cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    wcss.append(kmeans.fit(X).inertia_)

# WCSS against the number of clusters; the "elbow" suggests a cluster count
plt.figure(figsize=(12, 6))
plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()


In [0]:
# Fit a 3-cluster K-Means on X and keep the per-row cluster labels.
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42)
y_kmeans = kmeans.fit(X).labels_

In [8]:
# Fitting K-Means to the dataset
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42)
# Keep y_kmeans as the array of cluster labels. fit() returns the estimator
# itself, and storing that in y_kmeans makes the later
# pd.DataFrame(y_kmeans, columns=["cluster_no"]) fail when the notebook is
# run top to bottom.
y_kmeans = kmeans.fit_predict(X)
# Display the label assigned to each row of X
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 1, 1, 1, 0], dtype=int32)

In [0]:
# Wrap the cluster labels in a one-column frame named "cluster_no" so they can
# be concatenated with the country column afterwards.
# Assumes y_kmeans currently holds the label array from K-Means, not the
# fitted estimator. TODO: confirm when cells are run out of order.
y_kmeans = pd.DataFrame(y_kmeans, columns=["cluster_no"])

In [0]:
# Put each country next to its cluster label, then order the rows by cluster
# so members of the same cluster are listed together.
country_clusters = pd.concat([all_df["country"], y_kmeans], axis=1)
result_df = country_clusters.sort_values("cluster_no")
result_df