### Download the data and remove rows containing null values

In [1]:
# modules.data_module is a module we have created ourselves for useful data functions
import modules.data_module as dm
import pandas as pd

data_url = "https://corgis-edu.github.io/corgis/datasets/csv/graduates/graduates.csv"
data_path = "data/graduates.csv"

# Used to download and save for the first time
# dm.retrieve_data(data_url, save_to_file=True, save_path="data/graduates.csv")

data = dm.retrieve_data(data_path).set_index("Unnamed: 0")
data.index.name = "ID"

data

# Keep columns that are of interest.
#data = data.drop(["Salaries.Lowest", "Salaries.Highest"], axis=1)
data = data["Demographics.Total", "Salaries.Mean", "Education.Major", "Demographics.Ethnicity.Asians", "Demographics.Ethnicity.Minorities", "Demographics.Ethnicity.Whites", "Demographics.Gender.Females", "Demographics.Gender.Males"]

# Remove all rows where any column in the row has a value of 0
data = data[(data != 0).all(1)]

# Remove all rows where column not equal to 1993 as 1993 was before the Dotcom Bubble from 1995-2001.
# As we don't have more data before 1993, it wouldn't make sense to use that data as we don't know if the values
# were at peak, lower or the same before then.
data = (data[data['Year'] != 1993])
# The above causes decimals on Year. Let's get rid of those (so our plot doesnt show decimal years)
data['Year'] = data['Year'].astype(str).replace('\.0', '', regex=True)

# Remove all rows where Education.Major is not: 
# "Biological Sciences", "Chemical Engineering", "Chemistry", "Civil Engineering", "Computer Science and Math",
# "Economics", "Electrical Engineering", "Management & Administration", "Mechanical Engineering", "Other Engineering"
# "Physics and Astronomy" or "Psychology"
majors_of_interest = ["Biological Sciences", "Chemical Engineering", "Chemistry", "Civil Engineering", 
                      "Computer Science and Math", "Economics", "Electrical Engineering", "Management & Administration", 
                      "Mechanical Engineering", "Other Engineering","Physics and Astronomy", "Psychology"]
data = data.loc[data['Education.Major'].isin(majors_of_interest)]

data

KeyError: ('Demographics.Total', 'Salaries.Mean', 'Education.Major', 'Demographics.Ethnicity.Asians', 'Demographics.Ethnicity.Minorities', 'Demographics.Ethnicity.Whites', 'Demographics.Gender.Females', 'Demographics.Gender.Males')

### Plot Salaries.Mean for the majors of interest

In [None]:
# smp = salary mean plot
smp_data = data
smp_label = "Education.Major"
smp_x = "Year"
smp_y = "Salaries.Mean"
smp_xlabel = "Years"
smp_ylabel = "Salary Mean"
smp_labels = ["Electrical Engineering", "Physics and Astronomy", "Psychology", "Chemistry", "Economics", "Civil Engineering", "Physics and Astronomy", "Other Engineering"]
smp_title = "Salaries mean for given major between 1995 to 2015"

dm.plot_data(smp_data, smp_label, smp_x, smp_y, smp_xlabel, smp_ylabel, smp_labels, smp_title)

### Observations on the Salary graph for given majors between 1995 to 2015
We can see that certain educations are rising in salary since the start (1995) and up to the end year (2015), such as Electrical Engineering. It could indicate that it has been highly demanded throughout all the years, hence the salary increase.
On the other hand, the salary of the majority of the educations appears to be affected by financial crisis that begun in early 2007. Because of that, most of the educations had a high decrease in the salary during this period, which is also shown on the graph above. But after the financial crisis ending in 2009, the majority of the educations, such as Chemistry, Computer Science and Chemical Engineering are now at a higher salary level than before the crisis.

- Note til os: Lav nogle overordnede konklusioner når vi har de andre grafer med og få den røde tråd frem. Ovenstående er blot "åbenlyse" observationer.

### Plot Demographics.Ethnicity for the majors of interest

In [None]:
deap_data = data
deap_label = ["Education.Major", "Education.Major", "Education.Major"]
deap_x = ["Year", "Year", "Year"]
deap_y = ["Demographics.Ethnicity.Asians", "Demographics.Ethnicity.Whites", "Demographics.Ethnicity.Minorities"]
deap_xlabel = ["Year", "Year", "Year"]
deap_ylabel = ["Amount Of Majors", "Amount Of Majors",  "Amount Of Majors"]
deap_labels = majors_of_interest
deap_title = ["Amount of majors with demographic ethinicity of Asian for given major between 1995 to 2015",
              "Amount of majors with demographic ethinicity of Whites for given major between 1995 to 2015", 
              "Amount of majors with demographic ethinicity of Minorities for given major between 1995 to 2015"]

dm.plot_data(deap_data, deap_label, deap_x, deap_y, deap_xlabel, deap_ylabel, deap_labels, deap_title, 3)

In [None]:
# dgfp = demographics gender female plot
dgfp_data = data
dgfp_label = ["Education.Major", "Education.Major"]
dgfp_x = ["Year", "Year"]
dgfp_y = ["Demographics.Gender.Females", "Demographics.Gender.Males"]
dgfp_xlabel = ["Year", "Year"]
dgfp_ylabel = ["Amount Of Majors", "Amount Of Majors"]
dgfp_labels = majors_of_interest
dgfp_title = ["Amount of majors with demographic gender females for given major between 1995 to 2015", 
              "Amount of majors with demographic gender females for given major between 1995 to 2015"]

dm.plot_data(dgfp_data, dgfp_label, dgfp_x, dgfp_y, dgfp_xlabel, dgfp_ylabel, dgfp_labels, dgfp_title, 2)

In [None]:
# dgmp = demographics gender m plot
dgmp_data = data
dgmp_label = "Education.Major"
dgmp_x = "Year"
dgmp_y = "Demographics.Gender.Males"
dgmp_xlabel = "Years"
dgmp_ylabel = "Amount Of Majors"
dgmp_labels = majors_of_interest
dgmp_title = "Amount of majors with demographic gender males for given major between 1995 to 2015"

dm.plot_data(dgmp_data, dgmp_label, dgmp_x, dgmp_y, dgmp_xlabel, dgmp_ylabel, dgmp_labels, dgmp_title)

### Clustering of Data through Meanshift

In [None]:
from sklearn.cluster import MeanShift, estimate_bandwidth
import numpy as np

bandwidth = estimate_bandwidth(X=data.drop('Education.Major', axis=1), quantile=0.2)

ms = MeanShift(bandwidth=bandwidth)

ms.fit(data.drop('Education.Major', axis=1))
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters = len(labels_unique)

print('Number of estimated clusters : {}'.format(n_clusters)) 