Importing libraries and data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the lecture dataset. The relative path keeps the notebook runnable on
# any machine: place Lecture3.csv in the same folder as the notebook.
data = pd.read_csv('Lecture3.csv')


Setting display options

In [None]:
# Display preferences for printed DataFrames:
#   max_rows=None -> never truncate rows
#   max_columns   -> show at most 10 columns
#   width         -> allow 1000 characters per output line
display_options = {
    'display.max_rows': None,
    'display.max_columns': 10,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

Setting image properties and visual theme





In [None]:
#set image size and resolution (e.g. to meet publisher requirements)
#plt.rcParams is matplotlib's global settings dictionary, so no extra import is needed
plt.rcParams['figure.figsize'] = 6.4, 4.8  #width, height in inches (default=6.4, 4.8)
plt.rcParams['figure.dpi'] = 100   # dpi (default=100)

#seaborn customizations. set_theme is the universal control, the rest are specific sub controls
#CONTEXT options: notebook, paper, talk, poster [affects font sizes]
#STYLE options: darkgrid , whitegrid , dark , white , ticks
#PALETTE options: pastel, muted, bright, deep, dark, colorblind (and many more)
sns.set_theme(context='notebook', style='darkgrid', palette='deep', font='arial', font_scale=1)

sns.scatterplot(data, x='Happiness', y='GDP_pc', hue='Continent')

#try defining your own palette
my_colors = ["orange", "pink", "red", 'black', 'gold']

#start a new figure: axes-level seaborn plots in the same cell are otherwise
#drawn on top of each other on the current Axes
plt.figure()
sns.scatterplot(data, x='Happiness', y='GDP_pc', hue='Continent', palette=my_colors)

#to reset theme
sns.set()


Histogram

In [None]:
sns.histplot(data=data,x='GDP_pc',color='red',bins='auto')
plt.title("GDP per capita", fontsize=12)

#comparing mean vs. median gives us a good indication of skew
#(printed explicitly: a notebook only auto-displays the LAST expression of a cell)
print('mean:', np.mean(data['GDP_pc']))
print('median:', np.median(data['GDP_pc']))
print('50th percentile:', np.percentile(data['GDP_pc'], 50))
print(data['GDP_pc'].describe())

#Rice's rule (optimal no. of bins = cube root of no. of data points x 2)
optimal = len(data) ** (1/3) * 2   #exact cube root; ** 0.33 only approximates it

#new figure so the two histograms are not drawn on top of each other
plt.figure()
sns.histplot(data=data,x='GDP_pc',color='purple',bins=round(optimal))


ECDF

In [None]:
#using sns (the easy way)
sns.ecdfplot(data,x='GDP_pc',color='red')

#Verify reading from ECDF
#(printed explicitly: a bare expression in the middle of a cell is not displayed)
print('80th percentile of GDP_pc:', np.percentile(data.GDP_pc,80))

# Writing a Python function to plot ECDF
def ecdf(input_data):
    """Compute the empirical cumulative distribution function of a sample.

    Parameters
    ----------
    input_data : array-like
        One-dimensional collection of observations (e.g. a DataFrame column).
        Missing values (NaN) in floating-point input are ignored.

    Returns
    -------
    x : numpy.ndarray
        The observations sorted in increasing order.
    y : numpy.ndarray
        Cumulative proportions 1/n, 2/n, ..., 1, where y[i] is the share of
        observations less than or equal to x[i].
    """
    values = np.asarray(input_data)
    # np.sort pushes NaN to the end, where it would still be counted in n but
    # never plotted, so the visible curve would stop short of 1. Drop them first.
    if values.dtype.kind == 'f':
        values = values[~np.isnan(values)]
    # Number of (non-missing) data points: n
    n = len(values)
    # sort data in increasing order for the ECDF: x
    x = np.sort(values)
    # y-axis is the cumulative proportion for the ECDF: y
    y = np.arange(1, n + 1) / n
    return x, y

# Calling the function to compute ECDF for a particular data column
x, y = ecdf(data['GDP_pc'])

# Generate plot
# new figure so the points are not drawn over an earlier plot from the same cell
plt.figure()
plt.plot(x, y, marker='.', linestyle='none',c='black')   #try 'solid', 'dotted', 'dashed'
plt.xlabel('GDP per capita')   # the plotted column is GDP_pc
plt.ylabel('ECDF')
plt.title('Whatever you want')


#Plot Asia vs. Africa GDP
AsiaGDP = data.loc[data['Continent']=='ASIAP', 'GDP_pc']
AfricaGDP =data.loc[data['Continent']=='AFR', 'GDP_pc']

x1, y1 = ecdf(AsiaGDP)
x2, y2 = ecdf(AfricaGDP)

# Generate plot on its own figure, so only these two series share the Axes
plt.figure()
# Attach the legend text to each series directly. A positional
# plt.legend(['Asia','Africa']) labels the first two lines found on the Axes,
# whichever they happen to be.
plt.plot(x1,y1,marker='*', linestyle='none',c='purple',label='Asia')
plt.plot(x2,y2,marker='.', linestyle='none',c='green',label='Africa')
# Label the axes
plt.xlabel('GDP per capita')
plt.ylabel('ECDF')
plt.legend(loc='best')


Count plot

In [None]:
#each count plot gets its own figure; axes-level seaborn plots issued in the
#same cell are otherwise stacked on one Axes
sns.countplot(data, x='HDI')
plt.title('Number of countries in HDI categories')
plt.xlabel('HDI category')
plt.ylabel('Number')
plt.yticks(rotation=35)

#custom bar colours
plt.figure()
sns.countplot(data,x='HDI',palette=['green','red','orange','purple'])

#A more complex count plot
plt.figure()
sns.countplot(data,x='HDI',hue='Continent',order=['LOW','MEDIUM','HIGH','V_HIGH'])


Bar plot

In [None]:
#bar height = mean of y within each x category; error bars are drawn by default
sns.barplot(data,x='Continent',y='Happiness')

#same plot without error bars (errorbar=None replaces the deprecated ci=None)
plt.figure()
sns.barplot(data,x='Continent',y='Happiness',errorbar=None)

#if we don't set the order for the x-axis, how is the default order determined?
plt.figure()
sns.barplot(data,x='HDI',y='Happiness',hue='Continent')
plt.figure()
sns.barplot(data,x='HDI',y='Happiness',hue='Continent',order=['LOW','MEDIUM','HIGH','V_HIGH'])

#catplot allows us to plot other categories along columns and rows instead
#(figure-level function: it creates its own figure)
sns.catplot(data, x='HDI', y='Happiness', col='Continent', kind='bar')

"""
Useful variants to barplot: strip plot and box plot
"""
#each variant is drawn on a fresh figure; without plt.figure() the first one
#would land inside the last catplot panel and the others on top of it
plt.figure()
sns.barplot(data,x='HDI',y='Happiness')
plt.figure()
sns.stripplot(data,x='HDI',y='Happiness', color='black')
plt.figure()
sns.boxplot(data,x='HDI',y='Happiness')


Scatterplot

In [None]:
sns.scatterplot(data,x='Happiness',y='GDP_pc')

#correlation matrix of the two plotted variables
#(printed explicitly: a bare expression in the middle of a cell is not displayed)
print(data[['Happiness','GDP_pc']].corr())

#each variant below gets its own figure so the plots are not layered on one Axes
plt.figure()
sns.scatterplot(data,x='Happiness',y='GDP_pc',hue='Continent')
plt.figure()
sns.scatterplot(data,x='Life_exp',y='GDP_pc',size='Pop(m)')
plt.figure()
sns.scatterplot(data,x='Life_exp',y='GDP_pc',size='Pop(m)',sizes=(1, 100)) #control min and max size
plt.figure()
sns.scatterplot(data,x='Happiness',y='GDP_pc',hue='HDI',hue_order=['LOW','MEDIUM','HIGH','V_HIGH'])

#relplot allows us to plot other categories along columns and rows instead
#(figure-level function: it creates its own figure)
sns.relplot(data,x='Happiness',y='GDP_pc', col='Continent', kind='scatter')

#lmplot includes a regression line (also figure-level)
sns.lmplot(data,x='Happiness',y='GDP_pc')


#axis lines to demarcate 'quadrants'
#new figure first, otherwise the scatter and the mean lines are added to the lmplot panel
plt.figure()
sns.scatterplot(data,x='Happiness',y='GDP_pc',hue='HDI',hue_order=['LOW','MEDIUM','HIGH','V_HIGH'])
plt.axhline(y=np.mean(data['GDP_pc']), color='black', linestyle='--', linewidth=1)
plt.axvline(x=np.mean(data['Happiness']), color='black', linestyle='--', linewidth=1)



Time series plot

In [None]:
#https://pandas.pydata.org/pandas-docs/dev/getting_started/intro_tutorials/09_timeseries.html

data2 = pd.read_csv('Covid.csv',index_col=['date'])

#convert index to datetime index
data2.index = pd.to_datetime(data2.index)

#with datetime index, conveniently access certain date ranges of interest
data2['total_cases'].plot()

#row slice and column selected in a single .loc call (avoids chained indexing)
plt.figure()   #new figure so the sub-range is not drawn over the full series
data2.loc['2021-06':'2022-01', 'total_cases'].plot()


#resampling
#NOTE(review): pandas >= 2.2 deprecates the 'M'/'Y' aliases in favour of
#'ME'/'YE'; switch once the pandas version used for the course is confirmed
monthly_mean = data2['total_cases'].resample('M').mean()  #resample by month
print(monthly_mean)   #printed explicitly: mid-cell expressions are not displayed
plt.figure()
monthly_mean.plot()  #plot month means

yearly_mean = data2['total_cases'].resample('Y').mean()  #resample by year
print(yearly_mean)
plt.figure()
yearly_mean.plot()    #plot yearly means




Optional: using matplotlib

In [None]:
#set styles (various styles available, check online)
#note: the style stays active for every figure created after this line
plt.style.use('dark_background')

#Histogram
plt.figure()
plt.hist(data.GDP_pc, color='orange', bins=10)

#Count plots
# Extract values and labels (count once, reuse for both)
hdi_counts = data['HDI'].value_counts()
sizes = hdi_counts
labels = hdi_counts.index
# Some variants, each on its own figure (a pie and a bar chart cannot share one Axes)
plt.figure()
plt.pie(sizes, labels=labels)
plt.figure()
plt.bar(labels, sizes, color=['g','r','b','black'])

#Bar plot
# plt.bar does not aggregate: passing the raw columns draws one bar per row,
# stacked on top of each other within each continent. Compute the mean per
# continent first, which is what sns.barplot shows by default.
continent_means = data.groupby('Continent')['Life_exp'].mean()
plt.figure()
plt.bar(continent_means.index, continent_means.values, color=['white'])

#Scatter plot
plt.figure()
plt.scatter(data['Happiness'],data['Life_exp'], color=['pink'])

#Time series plot
plt.figure()
plt.plot(data2['total_cases'], color='r')
plt.xticks(rotation=60)


SEMINAR 3

In [None]:
data=pd.read_csv('Seminar3.csv')  #we don't need names as index this week

#Rice's rule: number of bins = 2 x cube root of the number of data points
#(1/3 is the exact cube-root exponent; .33 only approximates it)
rice_bins = round(len(data) ** (1/3) * 2)

#every axes-level plot below starts a new figure; otherwise all plots in this
#cell would be layered on a single Axes
sns.histplot(data, x='Test1', bins=rice_bins)
plt.figure()
sns.histplot(data, x='Test2')

plt.figure()
sns.ecdfplot(data,x ='WorkExp')
plt.figure()
sns.ecdfplot(data,x ='Average')

plt.figure()
sns.ecdfplot(data,x ='Average', hue='Gender')


plt.figure()
sns.barplot(data,y='Average', x='Background', hue='Gender', palette=['gold','silver'])
plt.legend(loc='upper right')
plt.title('Average scores by background and gender')


plt.figure()
sns.stripplot(data,y='Average', x='Background', hue='Gender', palette=['gold','silver'])

plt.figure()
sns.boxplot(data,y='Average', x='Background', hue='Gender', palette=['gold','silver'])

plt.figure()
sns.scatterplot(data, y='Test2', x='Test1')
#lmplot, pairplot and jointplot are figure-level: they create their own figures
sns.lmplot(data, y='Test2', x='Test1')



sns.pairplot(data)
sns.jointplot(data, x='Test1',y='Test2')