In [1]:
# Display plots directly in the notebook instead of in a new window
%matplotlib inline

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as stats
from scipy.stats import ttest_ind, chisquare, normaltest
import patsy

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Read the CAPEs and income CSV files into DataFrames
df_capes = pd.read_csv(filepath_or_buffer="CapesCleanedEnrolled.csv")
df_income = pd.read_csv(filepath_or_buffer="income_dataset.csv")

# Show the CAPEs frame as this cell's output
df_capes

In [None]:
# Helpers that standardize the instructor-name strings so the two datasets can be merged on them
def last_name(string):
    """Return the upper-cased text that precedes the first comma in ``string``.

    Parameters
    ----------
    string : str
        A name written as ``"Last, First ..."``.

    Returns
    -------
    str
        Everything before the first comma, upper-cased. When ``string``
        contains no comma the whole string is returned upper-cased.
    """
    # partition() yields the full string when the separator is missing, so a
    # comma-less value no longer loses its final character (find() gave -1,
    # which turned the slice into string[0:-1]).
    surname, _, _ = string.partition(',')
    return surname.upper()

def first_name(string):
    """Return the upper-cased first name (plus one following character) from ``"Last, First ..."``.

    Parameters
    ----------
    string : str
        A name written as ``"Last, First ..."``.

    Returns
    -------
    str
        The text after the first comma, upper-cased and with leading
        whitespace removed. If that text contains a space, it is cut off one
        character after the first space (e.g. ``"JOHN ADAM"`` -> ``"JOHN A"``).
        Returns ``""`` when ``string`` contains no comma.
    """
    _, comma, remainder = string.partition(',')
    if not comma:
        # No comma means there is no "First" part to extract; previously the
        # find() == -1 result made the slice drop the first character instead.
        return ''
    # Strip whatever whitespace follows the comma rather than assuming exactly
    # one space, so "Last,First" keeps its first letter.
    name = remainder.lstrip().upper()
    space = name.find(' ')
    if space != -1:
        # Keep the first word, the space, and the single character after it
        return name[:space+2]
    return name

def uppercase(string):
    """Return ``string`` converted to upper case via its ``upper()`` method."""
    shouted = string.upper()
    return shouted

def first_name_income(string):
    """Upper-case ``string`` and cut it off one character after its first space.

    A value without any space is returned upper-cased and otherwise unchanged;
    e.g. ``"John Adam"`` -> ``"JOHN A"`` and ``"john"`` -> ``"JOHN"``.
    """
    head, gap, tail = string.upper().partition(' ')
    # When there is no space, gap and tail are empty and head is the whole name
    return head + gap + tail[:1]

In [None]:
# Standardizing Capes Set
# Split the 'Instructor' string into its parts, then rebuild it as the merge key
df_capes['Last Name'] = df_capes['Instructor'].apply(last_name)
df_capes['First Name'] = df_capes['Instructor'].apply(first_name)
df_capes['Instructor'] = df_capes['Last Name'] + ', ' + df_capes['First Name']

# Standardizing Income Set
df_income['First Name'] = df_income['First Name'].apply(first_name_income)
# The CAPEs key uses an upper-cased last name, so upper-case the income last
# name as well; otherwise any mixed-case last name would silently fail to match
# in the merge. This is a no-op if the column is already upper case.
df_income['Instructor'] = df_income['Last Name'].str.upper() + ', ' + df_income['First Name']

In [None]:
# Join the income rows to the CAPEs rows on the shared 'Instructor' key, then
# drop the key itself along with the suffixed name/index columns the merge
# leaves behind.
df = (
    pd.merge(df_income, df_capes, on=['Instructor'])
    .drop(columns=[
        'Unnamed: 0_x', 'Last Name_x', 'First Name_x',
        'Instructor',
        'Unnamed: 0_y', 'Last Name_y', 'First Name_y',
    ])
)
df

# Data Analysis

In [None]:
# Horizontal bar chart of how many rows each department has
df['Department'].value_counts().plot.barh()

In [None]:
# Exclude rows whose Department is 'other'
df = df[df['Department'] != 'other']

# Determine outliers for Department: apply 1.5 * IQR fences to the
# per-department row counts
department_counts = df['Department'].value_counts()
lower, upper = np.percentile(department_counts, [25, 75])
iqr = upper - lower
lower_cutoff = lower - 1.5 * iqr
upper_cutoff = upper + 1.5 * iqr
# These are the fence values, not the 25th/75th percentiles themselves
print("Lower cutoff: ", lower_cutoff, "\nUpper cutoff: ", upper_cutoff)

# To see which ones are outliers (printed explicitly: a bare expression in the
# middle of a cell is never displayed)
print(department_counts)

# Get rid of the outliers
# NOTE(review): this list is hard-coded; presumably it was chosen by comparing
# the counts above with the cutoffs -- re-check it if the input data changes.
df = df[~df['Department'].isin(['Mathematics', 'Biology'])]
df

In [None]:
# Horizontal bar chart of per-department row counts for the current df
df['Department'].value_counts().plot.barh()

In [None]:
# Histogram of 'Regular Pay' for rows in the Computer Science department
plt.hist(df.loc[df['Department'] == 'Computer Science', 'Regular Pay'])

In [None]:
# Histogram of 'Regular Pay' for rows in the Literature department
plt.hist(df.loc[df['Department'] == 'Literature', 'Regular Pay'])

In [None]:
# One-way ANOVA on 'Regular Pay' across three departments
stats.f_oneway(*[
    df.loc[df['Department'] == department, 'Regular Pay'].values
    for department in ('Cognitive Science', 'Chemistry', 'Computer Science')
])

In [None]:
# One-way ANOVA on 'Regular Pay' across three departments
stats.f_oneway(*[
    df.loc[df['Department'] == department, 'Regular Pay'].values
    for department in ('Environmental Systems', 'Environmental Studies', 'Ethnic Studies')
])

In [None]:
# Independent two-sample t-test on 'Regular Pay': Economics vs. Visual Arts.
# NOTE(review): the variable names suggest Chemistry/Environmental, but the
# filters select Economics and Visual Arts; names are kept as-is in case other
# cells use them.
h_Chem = df.loc[df['Department'] == 'Economics', 'Regular Pay'].values
h_Env = df.loc[df['Department'] == 'Visual Arts', 'Regular Pay'].values
t_val, p_val = ttest_ind(h_Chem, h_Env)


In [None]:
# Display the p-value returned by ttest_ind for the Economics vs. Visual Arts comparison
p_val

In [None]:
# Display the t-statistic returned by ttest_ind for the Economics vs. Visual Arts comparison
t_val