## Ensemble Technique Project
### Github: https://github.com/cmelende/EnsembleTechniqueProject.git
### Cory Melendez
### 7/31/2022

In [None]:
import pandas as pd
import seaborn as sns
# Default figure size (width, height) applied to every plot drawn below.
sns.set(rc={'figure.figsize':(11.7,8.27)})

In [84]:
# Load the data set; the path is relative, so 'bank-full.csv' must sit in the
# notebook's working directory.
bankData = pd.read_csv('bank-full.csv')
# Preview the first rows to confirm the columns were parsed as expected.
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Target
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### 1. Univariate Analysis

### a. 

### Description of independent variables

In [86]:

class UniVariateAnalysis:
    """Summary statistics and IQR-based outlier lookup for one numeric column.

    Every statistic is computed from ``dataframe[columnName]`` at call time, so
    the quartiles, min/median/max and the outlier row filters always describe
    the same data, even if the column is reassigned after construction.
    """

    # Multiple of the IQR added beyond Q1/Q3 to place the outlier bounds.
    IQR_FENCE_FACTOR = 3 / 2

    def __init__(self, df, columnName):
        """Remember the frame and the column to analyse.

        Args:
            df: pandas DataFrame that holds the column.
            columnName: label of the column to summarise.

        Raises:
            KeyError: if ``columnName`` is not a column of ``df``.
        """
        self.columnName = columnName
        self.dataframe = df
        # Kept for backward compatibility with code that reads ``.series``;
        # the methods below re-read the column from ``dataframe`` instead so
        # that all statistics share a single source.
        self.series = df[columnName]

    def _column(self):
        """Return the analysed column as it currently exists in the frame."""
        return self.dataframe[self.columnName]

    def get_q1(self):
        """Return the first quartile (0.25 quantile)."""
        return self._column().quantile(.25)

    def get_q2(self):
        """Return the second quartile (0.5 quantile)."""
        return self._column().quantile(.5)

    def get_q3(self):
        """Return the third quartile (0.75 quantile)."""
        return self._column().quantile(.75)

    def get_q4(self):
        """Return the 1.0 quantile of the column."""
        return self._column().quantile(1)

    def get_iqr(self):
        """Return the inter-quartile range, Q3 - Q1."""
        return self.get_q3() - self.get_q1()

    def get_min(self):
        """Return the smallest value in the column."""
        return self._column().min()

    def get_median(self):
        """Return the median of the column."""
        return self._column().median()

    def get_max(self):
        """Return the largest value in the column."""
        return self._column().max()

    def get_data_type(self):
        """Return the dtype of the column."""
        return self._column().dtypes

    def get_lower_outlier_rows(self):
        """Return the rows whose value is strictly below the lower bound."""
        below = self._column() < self.get_lower_whisker_value()
        return self.dataframe.loc[below]

    def get_lower_whisker_value(self):
        """Return the lower outlier bound, Q1 - 1.5 * IQR.

        This is the computed bound itself, not an observed data point, so it
        can be smaller than the column's minimum.
        """
        return self.get_q1() - (self.IQR_FENCE_FACTOR * self.get_iqr())

    def get_higher_outlier_rows(self):
        """Return the rows whose value is strictly above the upper bound."""
        above = self._column() > self.get_higher_whisker_value()
        return self.dataframe.loc[above]

    def get_higher_whisker_value(self):
        """Return the upper outlier bound, Q3 + 1.5 * IQR.

        This is the computed bound itself, not an observed data point, so it
        can be larger than the column's maximum.
        """
        return self.get_q3() + (self.IQR_FENCE_FACTOR * self.get_iqr())

    def get_std(self):
        """Return the standard deviation of the column."""
        return self._column().std()

In [85]:
class UniVariateReport:
    """Plain-text console report for the values exposed by an analysis object."""

    def __init__(self, uniVariateAnalysis):
        """Keep the analysis object whose getters supply every printed value."""
        self.analysis = uniVariateAnalysis

    def _print_labelled(self, rows):
        """Print one ``label value`` line per (label, getter name) pair.

        Getters are resolved and called one at a time, immediately before
        their line is printed.
        """
        for label, getter_name in rows:
            print(label, getattr(self.analysis, getter_name)())

    def print_quartiles(self):
        """Print Q1 to Q4 followed by the min, median and max."""
        self._print_labelled((
            ("Q1: ", "get_q1"),
            ("Q2: ", "get_q2"),
            ("Q3: ", "get_q3"),
            ("Q4: ", "get_q4"),
            ("Min: ", "get_min"),
            ("Median: ", "get_median"),
            ("Max: ", "get_max"),
        ))

    def print_whiskers(self):
        """Print the upper bound first, then the lower bound."""
        self._print_labelled((
            ("Top whisker: ", "get_higher_whisker_value"),
            ("Bottom whisker: ", "get_lower_whisker_value"),
        ))

    def print_data_type(self):
        """Print the data type reported by the analysis."""
        self._print_labelled((("Data type: ", "get_data_type"),))

    def print_value_range(self):
        """Print the (min, max) pair on a single line."""
        lowest = self.analysis.get_min()
        highest = self.analysis.get_max()
        print(f'Range of values: ({lowest}, {highest})')

    def print_std(self):
        """Print the standard deviation."""
        self._print_labelled((("Standard deviation: ", "get_std"),))

    def print_report(self):
        """Print every section: data type, range, std, quartiles, whiskers."""
        sections = (
            self.print_data_type,
            self.print_value_range,
            self.print_std,
            self.print_quartiles,
            self.print_whiskers,
        )
        for emit in sections:
            emit()

### Age
* meaning - Age of person, possibly a good indicator of target variables
* analysis - Distribution of the data seems to indicate that most of the people in the dataset are 'younger' to middle aged.
* missing values - No missing values. The minimum age in the data is 18, so we can probably safely assume that anyone aged 18 or over is a valid customer




In [83]:
# Build the statistics helper for the 'age' column and print its full report.
ageAnalysis = UniVariateAnalysis(bankData, 'age')
# NOTE(review): 'agaAnalysisReport' looks like a typo for 'ageAnalysisReport';
# rename it once any later cells that use this name have been checked.
agaAnalysisReport = UniVariateReport(ageAnalysis)
agaAnalysisReport.print_report()

Data type:  int64
Range of values: (18, 95)
Standard deviation:  10.61876204097542
Q1:  33.0
Q2:  39.0
Q3:  48.0
Q4:  95.0
Min:  18
Median:  39.0
Max:  95
Top whisker:  70.5
Bottom whisker:  10.5


In [80]:

# Rows whose age lies above the upper bound (Q3 + 1.5 * IQR).
ageAnalysis.get_higher_outlier_rows()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Target
29158,83,retired,married,primary,no,425,no,no,telephone,2,feb,912,1,-1,0,unknown,no
29261,75,retired,divorced,primary,no,46,no,no,cellular,2,feb,294,1,-1,0,unknown,no
29263,75,retired,married,primary,no,3324,no,no,cellular,2,feb,149,1,-1,0,unknown,no
29322,83,retired,married,tertiary,no,6236,no,no,cellular,2,feb,283,2,-1,0,unknown,no
29865,75,retired,divorced,primary,no,3881,yes,no,cellular,4,feb,136,3,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45163,71,retired,married,secondary,no,2064,no,no,cellular,9,nov,379,2,92,3,failure,no
45191,75,retired,divorced,tertiary,no,3810,yes,no,cellular,16,nov,262,1,183,1,failure,yes
45204,73,retired,married,secondary,no,2850,no,no,cellular,17,nov,300,1,40,8,failure,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes


In [None]:
# Summary statistics for the columns of the data set.
bankData.describe()

In [None]:
# `sns.distplot` is deprecated (seaborn >= 0.11) and scheduled for removal.
# `histplot` with a KDE overlay and density scaling draws the equivalent
# histogram-plus-density figure.
sns.histplot(x=bankData['age'], bins=25, kde=True, stat='density')

In [None]:
# Pass the column as `x=` explicitly: a bare positional vector is read as `x`
# by older seaborn releases and as `data` by newer ones, which emits a warning
# or changes the box orientation depending on the installed version.
sns.boxplot(x=bankData['age'])

In [None]:
# List every row whose age is strictly below 18.
bankData.loc[bankData['age'].lt(18)]

In [None]:
# Per-column dtypes and non-null counts, used to check for missing values.
bankData.info()

job: 

marital:

education: 

default:

balance: 

housing: 

loan:

contact:

day: 

month:

duration:

campaign:

pdays:

previous: 

poutcome:

Target: assumed to be the target variable, i.e. the variable we want to predict