### Required Packages

#### Table of Contents

    1.0 Loading the Dataset
    1.1 Working without Duplicates
        1.1.1 Selecting Distinct Observations by removing duplicates.
        1.1.2 Counting distinct records.
        1.1.3 Selecting Top 10.
        1.1.4 Selecting with multiple conditions.
        1.1.5 Selecting only the required columns.
    1.2 Using Wild Cards
        1.2.1 Containing
        1.2.2 Like
        1.2.3 Startswith
        1.2.4 Endswith
    1.3 Table Joins
        1.3.1 Inner Join
        1.3.2 Left Join
        1.3.3 Right Join
    1.4 Sort Records
        1.4.1. 
    1.5 Case When 
    1.6 Converting a table from Flat to Wide using Case.


### 1.0 Loading the Datasets

In [None]:
import pandas as pd
import numpy as np
from pandasql import sqldf
import json
import os
from sklearn.datasets import fetch_california_housing
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Create a function to glimpse the data
def glimpse(df):
    """Print the shape of ``df``, then display its head, tail and column names.

    Relies on the notebook-provided ``display`` function for rich output.
    """
    n_rows, n_cols = df.shape
    print(f"{n_rows} rows and {n_cols} columns")
    # First rows, last rows, then the column names as a plain list.
    for view in (df.head(), df.tail(), df.columns.values.tolist()):
        display(view)

#### Datasets for Use

In [None]:
# Load data (will download the data if it's the first time loading)
housing = fetch_california_housing(as_frame=True)
# Create a dataframe
df = housing['data'].join(housing['target'])
glimpse(df)

In [None]:
df_2 = pd.read_csv("https://media.geeksforgeeks.org/wp-content/uploads/nba.csv")

# Drop every row that contains at least one null value (dropna works on rows
# by default, not columns) to avoid errors in the later cells
df_2.dropna(inplace = True)

glimpse(df_2)

In [None]:
url = 'http://universities.hipolabs.com/search?'
# Read the JSON document served at `url` into a data frame
df_3 = pd.read_json(url, orient='columns')

glimpse(df_3)

In [None]:
# NOTE(review): this cell was headed
#   select * from df_3 where upper(substring(name,2,2)) = 'ER'
# but the code below does not filter any rows and is case-sensitive; it only
# records where `sub` is found in each name.
# substring to be searched
sub ='er'
 
# position within each name at which the search starts
start = 2
 
# str.find returns, per row, the lowest index at or after `start` where `sub`
# occurs, or -1 when it does not occur; the result is stored in a new column
df_3["Indexes"]= df_3["name"].str.find(sub, start)
 
# display
df_3.head()

### Using Pandas Profiling to have a full view of the dataset

In [None]:
# Install hints (run once outside this cell):
#pip install -U pandas_profiling
#pip install -U scipy
# NOTE(review): pandas_profiling appears to have been renamed upstream
# (ydata-profiling) — confirm which package the environment provides.
import pandas_profiling as pp
# Build a profile report for the universities frame; as the last expression
# of the cell it is rendered by the notebook
pp.ProfileReport(df_3)

### 1.1 Working without duplicates

#### 1.1.1 selecting distinct observation from the record

In [None]:
#select distinct country, alpha_two_code, name from df_3
dis_out = df_3[['country','alpha_two_code','name']].drop_duplicates()
dis_out.head()

In [None]:
t1 = df_3[df_3['country']=='United States']
t1.name.value_counts()

In [None]:
df_3[(df_3['country']=='United States') & (df_3['name']=='Highland Community College') ]

#### 1.1.2 Counting distinct records

In [None]:
#select country, count(country) from df_3 group by country order by count(country) desc
# (value_counts sorts by frequency, most frequent first)
df_3['country'].value_counts()

In [None]:
out = df_3[['country','alpha_two_code','name']].drop_duplicates()
out['country'].value_counts()

#### 1.1.3 Selecting Top 10 countries by number of Universities

In [None]:
#select country, count(country) as counts from df_3 group by country order by counts desc limit 10
# value_counts already sorts descending; rename_axis/reset_index turn the
# result into a two-column frame (country, counts) before taking the top 10
df_3['country'].value_counts().rename_axis('country').reset_index(name='counts').head(10)

#### 1.1.4 selecting a variable and calculating the mean

In [None]:
#select Team, avg(Salary) as Salary from df_2 group by Team order by Team limit 10
# NOTE(review): the rows are ordered by Team (groupby sorts its keys), not by
# the average; add a sort on Salary if the ten highest averages are wanted.
df_2[['Team','Salary']].groupby('Team').mean().dropna().reset_index().head(10)

In [None]:
#select max(Discount) from df where Age >= 20
# NOTE(review): the filter is `>=`; use `==` if only Age 20 is meant.
# NOTE(review): df is built in this notebook from the California housing
# data — confirm it actually carries Age and Discount columns.
df[df.Age >= 20]['Discount'].max()

#### 1.3.1 Inner Join using a Key

In [None]:
# select * from df inner join df_1 on df.ClientID = df_1.ClientID
# NOTE(review): df_1 is not created in the visible part of this notebook.
df_inner = df.merge(df_1, how='inner', on='ClientID')

In [None]:
df_inner.head()

In [None]:
list(df_inner)

#### *** Selecting only the required columns

In [None]:
# Keep only the listed columns of the joined frame, in this order
df_inner_1 = df_inner[['ClientID','LoanDate','LoanAmount','Term','LoanSeries','LoanPeriodID','RepaymentTerm','InterestAmount',
 'Age','year_with_the_bank','default','predicted_scores','Defaulted_0','Defaulted_1']]

In [None]:
df_inner_1.head()

In [None]:
# Write the selected columns to CSV without the index column.
# NOTE(review): the destination is an absolute path on one user's Windows
# desktop; this cell fails on any machine where that folder does not exist.
df_inner_1.to_csv('C:\\Users\\seune\\desktop\\base_scoring_1.csv',index=False)

#### *** Selecting with multiple conditions.

In [None]:
# SQL equivalent:
#   select * from df
#   where LoanDate >= '2017-08-08' and LoanAmount > 50000 and Age > 50
#
# LoanDate is converted with pd.to_datetime before comparing. If the column
# holds text such as '8/8/2017', `>=` on the raw strings orders them character
# by character (e.g. '10/1/2017' < '8/8/2017') instead of by date.
# NOTE(review): assumes every LoanDate value is parseable by pd.to_datetime —
# confirm the stored format.
loan_dates = pd.to_datetime(df['LoanDate'])
meets_conditions = (
    (loan_dates >= pd.Timestamp('2017-08-08'))
    & (df['LoanAmount'] > 50000)
    & (df['Age'] > 50)
)
# np.where with a single argument returns a 1-tuple holding the array of
# integer positions at which the mask is True.
df_sql = np.where(meets_conditions)


In [None]:
# df_sql is the result of np.where(mask): a 1-tuple whose only element is the
# array of integer positions of the matching rows. Positions belong with
# .iloc; .loc would read them as index labels, which only coincides with the
# positions when df has a default 0..n-1 index.
df_sql_1 = df.iloc[df_sql[0]]
df_sql_1.head()

In [None]:
df_sql_1.shape

In [None]:
# select * from df_2 where Team = 'Atlanta Hawks' and Age >= 27 order by Team
#
# The Team column lives in the NBA frame df_2 (df_3 holds the universities
# data), so the filter is applied to df_2.
# NOTE(review): assumes df_2 also has an Age column — confirm.
# Sorting, dropna and the filter all return new frames, so df_2 itself is
# left untouched for the cells that follow.
nba_sorted = df_2.sort_values("Team").dropna()

# making boolean series for a team name
filter1 = nba_sorted["Team"] == "Atlanta Hawks"

# making boolean series for age
filter2 = nba_sorted["Age"] >= 27

# Boolean indexing keeps only the rows where both conditions hold
# (DataFrame.where would keep every row and blank the non-matching ones).
hawks_27_plus = nba_sorted[filter1 & filter2]

# display
hawks_27_plus

In [None]:
#select * from df_2 where Team = 'Atlanta Hawks' order by Team
#
# The Team column lives in the NBA frame df_2 (df_3 holds the universities
# data), so the filter is applied to df_2. sort_values returns a new frame,
# leaving df_2 itself unchanged.
nba_by_team = df_2.sort_values("Team")

# making boolean series for a team name
# (named team_filter so the builtin `filter` is not shadowed)
team_filter = nba_by_team["Team"] == "Atlanta Hawks"

# Boolean indexing keeps only the matching rows
# (DataFrame.where would keep every row and blank the non-matching ones).
atlanta_hawks = nba_by_team[team_filter]

# display
atlanta_hawks

#### *** Selecting only records with Wildcard 

In [None]:
df_distinct.head()

#### *** Where a string starts with U

In [None]:
#select * from df_distinct where alpha_two_code = 'US' and name like 'U%'
# np.where with a single argument returns a 1-tuple holding the array of
# integer positions at which the combined condition is True
df_json_sql = np.where((df_distinct['alpha_two_code'] == 'US') & (df_distinct['name'].str.startswith('U')))

In [None]:
# df_json_sql is the result of np.where(mask): a 1-tuple whose only element is
# the array of integer positions of the matching rows. Positions belong with
# .iloc; .loc would treat them as index labels and pick the wrong rows (or
# raise KeyError) whenever df_distinct's index is not a gap-free 0..n-1
# range, e.g. after drop_duplicates.
df_json_sql_1 = df_distinct.iloc[df_json_sql[0]]
df_json_sql_1.head()

In [None]:
df_json_sql_1.shape

In [None]:
#select * from df_distinct where alpha_two_code = 'US' and name like '%Uni'
# np.where with a single argument returns a 1-tuple holding the array of
# integer positions at which the combined condition is True.
df_json1 = np.where((df_distinct['alpha_two_code'] == 'US') & (df_distinct['name'].str.endswith('Uni')))
# Those are positions, so select with .iloc; .loc would treat them as index
# labels and pick the wrong rows (or raise KeyError) whenever df_distinct's
# index is not a gap-free 0..n-1 range.
df_json_1 = df_distinct.iloc[df_json1[0]]

#### *** Where a string contains 'Uni' and 'Was'

In [None]:
#select all record from the dataframe where name like Uni and name like Was
df_json_s = np.where((df_json['name'].str.contains('Uni')) & (df_json['name'].str.contains('Was')))

In [None]:
# df_json_s is the result of np.where(mask): a 1-tuple whose only element is
# the array of integer positions of the matching rows. Positions belong with
# .iloc; .loc would read them as index labels, which only coincides with the
# positions when df_json has a default 0..n-1 index.
df_json_s_1 = df_json.iloc[df_json_s[0]]
df_json_s_1.head()

In [None]:
df_json_s_a = df_json[df_json['name'].str.contains('Uni') & df_json['name'].str.contains('Was')]
df_json_s_a.head()

#### *** Select Country, alpha_two_code and count distinct Universities in each of the countries 

In [None]:
# For each of the two columns: number of distinct values ('nunique'), number
# of non-null values ('count') and total number of rows ('size')
df_json[['country','alpha_two_code']].agg(['nunique','count','size'])

In [None]:
df_json.groupby(['country','alpha_two_code']).agg(['count', 'nunique']).stack()

### ****Case when 

In [None]:
def myfunc(Start_Date,End_Date):
    """Return 'Y' if the date pair matches a listed window, otherwise 'N'.

    A CASE WHEN style lookup: each window is a tuple of accepted start-date
    strings plus one end-date string. The pair matches when ``Start_Date``
    equals any accepted start and ``End_Date`` equals the end.

    Parameters:
        Start_Date: value compared against the start-date strings.
        End_Date: value compared against the end-date strings.

    Returns:
        'Y' on the first matching window, 'N' when none matches.
    """
    # NOTE(review): the original condition tested '2014-07-27' twice, so only
    # that one start date is accepted for the first window — confirm whether
    # a second, different start date was intended.
    full_year_windows = (
        (('2014-07-27',), '2015-07-25'),
        (('2015-07-26', '2015-07-27'), '2016-07-30'),
        (('2016-07-31',), '2017-07-29'),
        (('2017-07-30', '2017-07-31'), '2018-07-28'),
    )
    for accepted_starts, window_end in full_year_windows:
        if Start_Date in accepted_starts and End_Date == window_end:
            return 'Y'
    return 'N'