## Importing Pandas

In [1]:
#importing the pandas library
import pandas as pd

In [7]:
# Build a small demo DataFrame from a dictionary of equal-length lists with from_dict
import random
# fixed seed so the randomly generated ages are the same on every run
random.seed(3)
names = ["Jess","Jordon","Sandy","Ted","Barney","Tyler","Rebecca"]
# one random age between 18 and 35 (both ends included) per name
ages = [random.randint(18, 35) for _ in names]
people = dict(names=names, ages=ages)
df = pd.DataFrame.from_dict(people)
print(df)

     names  ages
0     Jess    25
1   Jordon    35
2    Sandy    22
3      Ted    29
4   Barney    33
5    Tyler    20
6  Rebecca    18


## Accessing data

## Indexing by column

In [15]:
# Select a whole column by its label: df["ages"] gives back the column as a Series
print(df["ages"])
# Read one value with a single .loc[row_label, column_label] lookup rather than
# chained indexing (df["ages"][3]); 3 is the index label of the record here
print(df.loc[3, "ages"])

0    25
1    35
2    22
3    29
4    33
5    20
6    18
Name: ages, dtype: int64
29


## Indexing by Record

In [12]:
# Select a whole record by its index label with .loc; the row is shown as a Series
print(df.loc[0])
# Read a single field of that record with one .loc[row_label, column_label]
# lookup instead of chaining a second [] onto the selected row
print(df.loc[0, "names"])

names    Jess
ages       25
Name: 0, dtype: object
Jess


## Slicing a Dataframe

In [16]:
# Grab a contiguous run of records by position: rows 2, 3 and 4 (the stop value 5 is excluded)
print(df.iloc[2:5])

    names  ages
2   Sandy    22
3     Ted    29
4  Barney    33


## Built-in Methods

In [17]:
# Preview the first 5 records with head(); as the last expression in the cell,
# the returned frame is what the notebook renders as the output table.
df.head(5)

Unnamed: 0,names,ages
0,Jess,25
1,Jordon,35
2,Sandy,22
3,Ted,29
4,Barney,33


In [18]:
# Preview the last 3 records with tail(); the rows keep their own index labels
# (4, 5 and 6 in the output of this cell).
df.tail(3)

Unnamed: 0,names,ages
4,Barney,33
5,Tyler,20
6,Rebecca,18


In [21]:
# Fetch the column headers with keys(); for this frame the result is an Index
# holding 'names' and 'ages'.
header = df.keys()
print(header)

Index(['names', 'ages'], dtype='object')


In [24]:
# shape is a (number of records, number of columns) tuple; (7, 2) at this point
print(df.shape)

(7, 2)


In [26]:
# General statistics of the DataFrame via describe(): count, mean, std, min,
# quartiles and max. Called with its defaults on this frame, only the numeric
# column ("ages") is summarised; the string column "names" is left out.
df.describe()

Unnamed: 0,ages
count,7.0
mean,26.0
std,6.531973
min,18.0
25%,21.0
50%,25.0
75%,31.0
max,35.0


In [27]:
# Sort the records by the "ages" column in ascending order. sort_values() hands
# back a sorted DataFrame, and that result is assigned to df here, so df is
# replaced by the sorted version and every later cell sees this row order.
# The rows keep their original index labels (6, 5, 2, 0, 3, ... in the output).
df = df.sort_values("ages")
df.head(5)

Unnamed: 0,names,ages
6,Rebecca,18
5,Tyler,20
2,Sandy,22
0,Jess,25
3,Ted,29


## Filtration

## Conditionals

In [28]:
# Compare every age against 21 to build a True/False Series (one flag per record)
# that can later serve as a filter
can_drink = df["ages"].gt(21)
print(can_drink)

6    False
5    False
2     True
0     True
3     True
4     True
1     True
Name: ages, dtype: bool


## Subsetting

In [29]:
# Keep only the records whose age is above 21 by passing a boolean mask to .loc;
# the filtered rows come back as a result and df itself is not reassigned
df.loc[df["ages"] > 21]

Unnamed: 0,names,ages
2,Sandy,22
0,Jess,25
3,Ted,29
4,Barney,33
1,Jordon,35


## Column Transformations

## Generating a New Column with Data

In [31]:
# Attach a "tenure" column of made-up values (0 to 10) to stand in for customer data
random.seed(311)  # fixed seed so the fake tenures are reproducible
# one random value per record currently in the DataFrame
tenure = [random.randint(0, 10) for _ in df.index]
df["tenure"] = tenure
df.head()

Unnamed: 0,names,ages,tenure
6,Rebecca,18,4
5,Tyler,20,4
2,Sandy,22,1
0,Jess,25,6
3,Ted,29,7


## Feature engineering using apply

In [34]:
# user-defined function (UDF) used to engineer a new column from the known ages
def ageGroup(age):
    """Label an age: "Teenager" when it is below 21, otherwise "Adult"."""
    if age < 21:
        return "Teenager"
    return "Adult"
# Derive the "age_group" column by running ageGroup on every value of "ages"
df["age_group"] = df["ages"].apply(ageGroup)
df.head()

Unnamed: 0,names,ages,tenure,age_group
6,Rebecca,18,4,Teenager
5,Tyler,20,4,Teenager
2,Sandy,22,1,Adult
0,Jess,25,6,Adult
3,Ted,29,7,Adult


## Aggregations

In [35]:
# Group the records by "age_group" and count how many values each remaining
# column has per group. With as_index=False, "age_group" shows up as a regular
# column of the result (see the output) rather than as its index.
df.groupby("age_group",as_index=False).count()

Unnamed: 0,age_group,names,ages,tenure
0,Adult,5,5,5
1,Teenager,2,2,2


In [38]:
# Average the numeric columns (ages, tenure) within each age group.
# numeric_only=True is passed explicitly because the frame also holds the string
# column "names": pandas 2.0+ no longer drops such columns silently and raises a
# TypeError when asked to average them.
df.groupby("age_group",as_index=False).mean(numeric_only=True).head()

Unnamed: 0,age_group,ages,tenure
0,Adult,28.8,4.0
1,Teenager,19.0,4.0


In [39]:
# Group by "age_group" first and "tenure" second, counting the records in each
# (age_group, tenure) combination. head() limits the display to the first 5
# groups, which is why only "Adult" rows appear in the output.
df.groupby(["age_group","tenure"],as_index=False).count().head()

Unnamed: 0,age_group,tenure,names,ages
0,Adult,1,1,1
1,Adult,2,1,1
2,Adult,4,1,1
3,Adult,6,1,1
4,Adult,7,1,1


## Adding a record

In [41]:
# Add a record to the bottom of the DataFrame: assigning to .loc with an index
# label that does not exist yet (7) appends a new row. The values are given
# positionally in column order: names, ages, tenure, age_group.
# NOTE: "Jess" already exists at index 0, so this deliberately creates a
# duplicate name.
df.loc[7] = ["Jess",25,2,"Adult"]
df

Unnamed: 0,names,ages,tenure,age_group
6,Rebecca,18,4,Teenager
5,Tyler,20,4,Teenager
2,Sandy,22,1,Adult
0,Jess,25,6,Adult
3,Ted,29,7,Adult
4,Barney,33,2,Adult
1,Jordon,35,4,Adult
7,Jess,25,2,Adult


In [42]:
# Drop records that repeat an earlier name; the first occurrence of each name is
# kept, so the second "Jess" row (index 7) is removed again
df = df.drop_duplicates(subset=["names"], keep="first")
df

Unnamed: 0,names,ages,tenure,age_group
6,Rebecca,18,4,Teenager
5,Tyler,20,4,Teenager
2,Sandy,22,1,Adult
0,Jess,25,6,Adult
3,Ted,29,7,Adult
4,Barney,33,2,Adult
1,Jordon,35,4,Adult


## Pandas Joins

In [46]:
# Build a second demo DataFrame that shares some names with df and adds a new
# "ratings" column. The raw dictionary gets its own name so that `ratings` only
# ever refers to the DataFrame instead of being rebound from a dict to a frame.
ratings_data = {
    "names" : ["Jess","Tyler","Ted"],
    "ratings" : [10,9,6]
}
ratings = pd.DataFrame.from_dict(ratings_data)
ratings.head()

Unnamed: 0,names,ratings
0,Jess,10
1,Tyler,9
2,Ted,6


## Inner Join

In [48]:
# Inner join of df and ratings on "names": only the people present in both
# DataFrames are kept, each with their rating attached
matched_ratings = pd.merge(df, ratings, how="inner", on="names")
matched_ratings.head()

Unnamed: 0,names,ages,tenure,age_group,ratings
0,Tyler,20,4,Teenager,9
1,Jess,25,6,Adult,10
2,Ted,29,7,Adult,6


## Outer Join

In [50]:
# Perform an outer join of the df and ratings DataFrames on "names" to get all
# the data: every record is kept, and people without a match in ratings have a
# missing value in the "ratings" column (blank entries in the output).
all_ratings = df.merge(ratings,on="names",how="outer")
all_ratings.head(10)

Unnamed: 0,names,ages,tenure,age_group,ratings
0,Rebecca,18,4,Teenager,
1,Tyler,20,4,Teenager,9.0
2,Sandy,22,1,Adult,
3,Jess,25,6,Adult,10.0
4,Ted,29,7,Adult,6.0
5,Barney,33,2,Adult,
6,Jordon,35,4,Adult,


## Dataset Pipeline

### 1. Perform Exploratory Analysis

### 2. Data Cleaning

### 3. Feature Engineering