# A) Data Cleaning

In [None]:
from Project.Data_Cleaning import clean_data
from Project.GDP_Pop_Extraction.extraction import extract_data

# Load the raw input through the project's extraction helper.
df = extract_data()

# Cleaning takes about 1.5 minutes on an average laptop. If this step is not the
# focus, a previously exported copy of the result can be read from disk instead
# (the next cell loads output/modified.csv).
# According to the original note, most of the wait comes from __decimal_fix,
# which is run from CleanData's init function.
# To see logging messages, pass log_level=0 to CleanData.
# NOTE(review): the original note spelled this `cd.CleanData(log_level = 0)`,
# but no `cd` alias is defined in this notebook - confirm the exact call.
data = clean_data.CleanData(df)
# Pull the cleaning results out of the CleanData object; `modified` is the table
# used by the cells below.
modified = data.modified
NaN = data.NaN
NoNaN = data.NoNaN
feature_tables = data.feature_tables

# Uncomment to write the results of data cleaning out as CSV files.
#data.create_csvs()

In [None]:
# Shortcut: load the previously exported cleaned dataset instead of waiting for
# the cleaning cell above. Running this cell rebinds `modified` to the CSV copy,
# so skip (or comment out) the cleaning cell when using it.
# WARNING: the PCA and map_display cells still require the CleanData object
# (`data`) created by the cleaning cell.
import pandas as pd

# 'Unnamed: 0' is the index column that was written into the CSV on export.
modified = pd.read_csv('output/modified.csv').drop(columns='Unnamed: 0')

In [None]:
print('\nBelow there is a sample from the modified dataset')
# Bare expression: the notebook renders the DataFrame as this cell's output.
modified

# B) Principal Component Analysis

In [None]:
from Project.PCA import pca_analysis

# Run the PCA on the CleanData object built in the Data Cleaning section
# (the CSV shortcut alone is not enough: `data` must exist).
result_pca = pca_analysis.PCA_Analysis(data)

# Number of eigenvalues needed to reach the information threshold quoted in the
# message below.
eig_vals = result_pca.eig_vals
Nmin = pca_analysis.calnum(eig_vals)
print(f'Minimum number of eigenvalues for the subspace to provide enough (85%) information: {Nmin}\n')

x = result_pca.x
finalDf = result_pca.finalDf
features = result_pca.features

pcoef, finalcomp = pca_analysis.calcoefficient(x, finalDf, features)

print('The 5 different feature combinations obtained from PCA are shown below.')
print('In each combination there are 4 positive and 4 negative components.\n')
# One numbered line per combination; join() avoids the dangling ", " that a
# per-item print with end=", " leaves after the last component.
for rank, combination in enumerate(finalcomp, start=1):
    print(f'{rank} - ', ', '.join(str(component) for component in combination))

print('\nWe will use these results to determine the most relevant features.')

# C) Pre Visualization

In [None]:
from Project.Map_Display import map_display
from IPython.display import SVG

# Candidate features are every column except the two identifier columns.
# Reading `.columns` does not modify the frame, so no DataFrame copy is needed.
features = list(modified.columns)
features.remove('Country')
features.remove('Year')
print(features)

# NOTE(review): features[1] is picked by position, so the feature shown depends
# on the column order of `modified` - check the printed list above.
map_display.displaymap(data, features[1], 2000)

In [None]:
def createBar():
    """
    Use matplotlib to draw a color bar to show next to the map.

    The bar spans the minimum to maximum of the 'Life expectancy ' column of
    the notebook-level ``modified`` DataFrame and uses the 'RdYlGn' colormap.
    The figure is only shown, not saved or returned, so it has to be pasted
    onto the map externally.
    """
    from matplotlib import pyplot as plt
    import numpy as np

    # Series.min()/max() skip missing values; the builtin min()/max() give
    # order-dependent results when NaN is present.
    life_expectancy = modified['Life expectancy ']
    minValue = life_expectancy.min()
    maxValue = life_expectancy.max()

    # A 100x100 grid whose values vary only along x (see f), i.e. a plain
    # horizontal gradient covering the data range.
    x = np.linspace(minValue, maxValue, 100)
    y = np.linspace(minValue, maxValue, 100)
    X, Y = np.meshgrid(x, y)
    # imshow only accepts origin='upper' or origin='lower'; 'low' is not a
    # valid value.
    plt.imshow(f(X, Y), cmap='RdYlGn', origin='lower')
    plt.colorbar()

    # Hide the axis ticks: only the color bar is of interest.
    plt.xticks(())
    plt.yticks(())
    plt.show()
    
def f(x, y):
    """Return ``x`` unchanged; ``y`` is accepted but ignored.

    createBar calls this with a meshgrid pair, so the resulting image varies
    only along the x direction.
    """
    return x

In [None]:
# Same map call as the earlier displaymap cell, with 2015 as the last argument.
# Requires the CleanData object `data` and the `features` list built earlier.
map_display.displaymap(data,features[1], 2015)

In [None]:
#map_display.display(data,features[3], 2013)
#map_display.display(data,features[4], 2012)

In [None]:
from Project.Map_Display.analysis_part1 import analysis

# The function below prints raw output, which is the first step of our analysis.
# It receives a copy, so the call cannot alter `modified` itself.
analysis(modified.copy())

In [None]:
from Project.Map_Display.analysis_part1 import sortdata
from Project.Map_Display.analysis_part1 import showScatter
from Project.Map_Display.analysis_part1 import showBar_alt
import altair as alt
# Select the Altair renderer used to display charts in this notebook.
# NOTE(review): the available renderer names depend on the installed Altair
# version and on the front end (classic Notebook vs JupyterLab) - confirm that
# 'notebook' is still valid in the target environment.
alt.renderers.enable('notebook')

# D) Visualization Part 1 (Differences between countries)

In [None]:
# The cells below show the plots we present for countries with the best and
# worst life expectancy.
# NOTE(review): the columns passed are 'GDP' and ' BMI ' while the title reads
# "GDP vs Life expectancy" - confirm against showScatter which is intended.
showScatter(modified.copy(),'GDP',' BMI ',height=400,width=400,year=2015,title="GDP vs Life expectancy")

In [None]:
# Plot the comparison of life expectancy between the top 10 and bottom 10 countries.
# NOTE(review): the original comment says this is for the year 2000, but no year
# is passed here - confirm which year showBar_alt uses by default.
showBar_alt(modified.copy(),height=600,width=600,title='Countries with highest and lowest life expectancy')

In [None]:
# Plot the 'Hepatitis B' and 'Polio' columns for 2015.
# NOTE(review): the original comment named Diphtheria and life expectancy, but
# the columns passed are 'Hepatitis B' and 'Polio' - confirm which was intended.
showScatter(modified.copy(),'Hepatitis B','Polio',nums=20,year=2015,height=400,width=400)

In [None]:
# Plot the 'infant deaths' and 'Adult Mortality' columns for 2015.
showScatter(modified.copy(),'infant deaths','Adult Mortality',nums=20,year=2015)
# Adult mortality, as shown in the World Development Indicators (WDI) database
# and related products, is the probability that a person who has reached age 15
# will die before reaching age 60 (shown per 1,000 persons).

# Infant deaths refers to the probability of dying before 12 months of age,
# per 1,000 persons (source: WHO).

In [None]:
# Bubble chart of the 2015 rows: BMI on the x axis, life expectancy on the
# y axis, with the circle size encoding GDP.
modified_temp = modified.copy()
rows_2015 = modified_temp[modified_temp['Year'] == 2015]

# Neither axis is forced to include zero.
bmi_axis = alt.X(' BMI ', scale=alt.Scale(zero=False))
life_axis = alt.Y('Life expectancy ', scale=alt.Scale(zero=False, padding=1))

bubbles = alt.Chart(rows_2015).mark_circle().encode(
    bmi_axis,
    life_axis,
    #color='Country',
    size='GDP',
)
# Trailing bare expression so the notebook renders the chart as cell output.
bubbles.properties(height=600, width=800)

# E) Visualization Part 2 (Most relevant features)

In [None]:
from Project.Relevance_Analysis import integrate_relevance
from Project.Relevance_Analysis import integrate_year
from Project.Relevance_Analysis import rate_highlight
from Project.Relevance_Analysis import wordcloud

#Modified should be already defined above

In [None]:
# Runs on a copy, so the call cannot alter `modified` itself.
integrate_year.visual_allfeaturechaning(modified.copy())

In [None]:
# Runs on a copy, so the call cannot alter `modified` itself.
rate_highlight.visual_highlightrate(modified.copy())

In [None]:
#TODO integrate relevance


In [None]:
# Comma-separated feature labels handed to the word cloud helper.
# NOTE(review): 'GDP' appears twice in this list (first and second-to-last
# entry) - confirm whether the repetition is deliberate.
text = 'GDP,HIV,Population, Schooling, Mortality,infant,Alcohol,expenditure%,Measles,BMI,15belowdeaths,Polio,ExpenditureTotal,Diphtheria,IncomeResources,GDP,YoungThinness'

wordcloud.visual_wordcloud(text)

# F) Visualization Part 3 (Change through the years)

In [None]:
from Project.Change_Analysis import change_analysis

# Generates the graphs needed for the third part of our visualization.
# Runs on a copy, so the call cannot alter `modified` itself.
change_analysis.generate_results(modified.copy())