## Part 1:
Using data found in sales.csv perform the following tasks:

1. Standardize the Price column
2. Identify and remove outliers in the dataset

# Standardizing Column Values

In [1]:
import pandas as pd
import numpy as np

In [12]:
# Load the raw sales records and summarise every column (numeric and non-numeric).
sales = pd.read_csv("Fradejas_sales.csv")
sales.describe(include="all")

Unnamed: 0,Transaction_date,Product,Price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude
count,997,997,999.0,997,1000,996,995,1000,997,993,997.0,999.0
unique,979,4,12.0,4,761,752,204,56,968,965,,
top,1/8/2009 15:27,Product1,1200.0,Visa,Sarah,London,England,United States,6/3/2008 4:22,2/26/2009 7:32,,
freq,2,846,838.0,520,11,19,84,466,3,3,,
mean,,,,,,,,,,,38.984398,-41.417494
std,,,,,,,,,,,19.499041,67.377526
min,,,,,,,,,,,-41.465,-159.48528
25%,,,,,,,,,,,35.7825,-88.207775
50%,,,,,,,,,,,42.30972,-73.73389
75%,,,,,,,,,,,51.05,4.85


In [13]:
# Remove the thousands separator (",") from the Price column only, so that the
# column can later be cast to float. Restricting the replacement to Price leaves
# commas that appear in other text columns untouched.
sales['Price'] = sales['Price'].replace(',', '', regex=True)

In [15]:
# Cast the Price column (not the whole data frame) to float.
sales = sales.astype({'Price': float})

In [16]:
# Summarise the frame again now that Price has been converted to float.
sales.describe(include="all")

Unnamed: 0,Transaction_date,Product,Price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude
count,997,997,999.0,997,1000,996,995,1000,997,993,997.0,999.0
unique,979,4,,4,761,752,204,56,968,965,,
top,1/8/2009 15:27,Product1,,Visa,Sarah,London,England,United States,6/3/2008 4:22,2/26/2009 7:32,,
freq,2,846,,520,11,19,84,466,3,3,,
mean,,,1650.970971,,,,,,,,38.984398,-41.417494
std,,,1210.577115,,,,,,,,19.499041,67.377526
min,,,20.0,,,,,,,,-41.465,-159.48528
25%,,,1200.0,,,,,,,,35.7825,-88.207775
50%,,,1200.0,,,,,,,,42.30972,-73.73389
75%,,,1200.0,,,,,,,,51.05,4.85


In [23]:
# Reserve a placeholder column at position 3 for the standardized price.
# DataFrame.insert raises ValueError when the column already exists, so the
# call is guarded to keep this cell safe to re-run.
if "standardized_price" not in sales.columns:
    sales.insert(3, "standardized_price", 0)

In [25]:
# Z-score of Price: subtract the mean, then divide by the standard deviation.
# ddof=0 (population standard deviation) is spelled out because Series.std
# defaults to ddof=1; this keeps the same result np.std gave here.
# Bracket assignment is used because attribute-style assignment cannot create
# a column and silently sets a plain attribute when the column is missing.
sales["standardized_price"] = (sales["Price"] - sales["Price"].mean()) / sales["Price"].std(ddof=0)

In [1]:
# Look for uncommon values by counting how often each standardized price occurs.
sales["standardized_price"].value_counts()

NameError: name 'sales' is not defined

In [39]:
# Keep every row except those whose standardized price is greater than 5
# (rows with a missing standardized price are kept as well).
sales_new = sales.loc[~sales['standardized_price'].gt(5)]

In [43]:
# Frequency of each standardized price that remains after the outlier removal.
sales_new["standardized_price"].value_counts()

-0.372712    838
 1.610806    137
 4.834024     15
-1.157855      1
 0.371107      1
-1.347942      1
 0.123167      1
-0.331389      1
-0.703299      1
Name: standardized_price, dtype: int64

In [41]:
# Summary of the sales data after the outlier rows were removed.
sales_new.describe(include="all")

Unnamed: 0,Transaction_date,Product,Price,standardized_price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude
count,994,994,996.0,996.0,994,997,993,992,997,994,990,994.0,996.0
unique,976,4,,,4,759,750,204,56,966,963,,
top,1/8/2009 15:27,Product1,,,Visa,Sarah,London,England,United States,1/18/2009 0:00,2/26/2009 7:32,,
freq,2,843,,,518,11,19,84,464,3,3,,
mean,,,1624.016064,-0.022277,,,,,,,,38.975401,-41.312716
std,,,1102.316855,0.911027,,,,,,,,19.52228,67.396924
min,,,20.0,-1.347942,,,,,,,,-41.465,-159.48528
25%,,,1200.0,-0.372712,,,,,,,,35.816944,-88.027987
50%,,,1200.0,-0.372712,,,,,,,,42.320695,-73.730695
75%,,,1200.0,-0.372712,,,,,,,,51.033333,4.916667


## Part 2:
Using data found in missing_values.csv perform the following tasks:

1. Identify the missing values in the dataset
2. Handle the missing values (fill them in or remove the affected rows)

In [63]:
import pandas as pd
import numpy as np

# Load the data set that contains missing entries and summarise every column.
values = pd.read_csv("Fradejas_missing_values.csv")
values.describe(include="all")

Unnamed: 0,ID,Sex,Age,Income,Employed,Children,Buy_Car
count,20.0,16,20.0,17.0,20,18.0,20
unique,,2,,,2,,2
top,,Male,,,Married,,No
freq,,10,,,15,,11
mean,10.5,,28.15,26245.705882,,1.222222,
std,5.91608,,1.308877,583.052395,,1.437136,
min,1.0,,25.0,25094.0,,0.0,
25%,5.75,,28.0,26037.0,,0.0,
50%,10.5,,28.0,26234.0,,1.0,
75%,15.25,,29.0,26666.0,,2.0,


In [57]:
# Display the loaded frame.
values

Unnamed: 0,ID,Sex,Age,Income,Employed,Children,Buy_Car
0,1,Male,25,25146.0,Single,0.0,No
1,2,Male,30,26939.0,Married,2.0,Yes
2,3,Male,27,26693.0,Married,0.0,No
3,4,Male,28,26666.0,Married,3.0,Yes
4,5,Male,29,25899.0,Married,0.0,No
5,6,Male,28,26462.0,Married,1.0,No
6,7,Female,28,,Married,3.0,Yes
7,8,,30,26037.0,Married,2.0,Yes
8,9,Female,28,26167.0,Married,1.0,Yes
9,10,,28,,Single,,No


In [64]:
# Every row whose 'Employed' value is 'Single' gets Children set to 0
# (this also fills a missing Children value on those rows).
values.loc[values['Employed'].eq('Single'), 'Children'] = 0
values

Unnamed: 0,ID,Sex,Age,Income,Employed,Children,Buy_Car
0,1,Male,25,25146.0,Single,0.0,No
1,2,Male,30,26939.0,Married,2.0,Yes
2,3,Male,27,26693.0,Married,0.0,No
3,4,Male,28,26666.0,Married,3.0,Yes
4,5,Male,29,25899.0,Married,0.0,No
5,6,Male,28,26462.0,Married,1.0,No
6,7,Female,28,,Married,3.0,Yes
7,8,,30,26037.0,Married,2.0,Yes
8,9,Female,28,26167.0,Married,1.0,Yes
9,10,,28,,Single,0.0,No


In [65]:
# Drop the row labelled 9 (ID 10): its Sex and Income are missing and no car was bought.
new_values = values.drop(index=9)
new_values

Unnamed: 0,ID,Sex,Age,Income,Employed,Children,Buy_Car
0,1,Male,25,25146.0,Single,0.0,No
1,2,Male,30,26939.0,Married,2.0,Yes
2,3,Male,27,26693.0,Married,0.0,No
3,4,Male,28,26666.0,Married,3.0,Yes
4,5,Male,29,25899.0,Married,0.0,No
5,6,Male,28,26462.0,Married,1.0,No
6,7,Female,28,,Married,3.0,Yes
7,8,,30,26037.0,Married,2.0,Yes
8,9,Female,28,26167.0,Married,1.0,Yes
10,11,Female,28,26905.0,Single,0.0,No


In [67]:
# Count the missing (NaN) entries in each column.
new_values.isna().sum()

ID          0
Sex         3
Age         0
Income      2
Employed    0
Children    0
Buy_Car     0
dtype: int64

In [69]:
# Mean of the known Income values (Series.mean skips NaN by default).
mean_value = new_values['Income'].mean()
mean_value 

26245.70588235294

In [70]:
# Fill the missing Income entries with the mean income; other columns are untouched.
new_values = new_values.fillna({'Income': mean_value})

In [71]:
# Display the frame after the Income fill.
new_values

Unnamed: 0,ID,Sex,Age,Income,Employed,Children,Buy_Car
0,1,Male,25,25146.0,Single,0.0,No
1,2,Male,30,26939.0,Married,2.0,Yes
2,3,Male,27,26693.0,Married,0.0,No
3,4,Male,28,26666.0,Married,3.0,Yes
4,5,Male,29,25899.0,Married,0.0,No
5,6,Male,28,26462.0,Married,1.0,No
6,7,Female,28,26245.705882,Married,3.0,Yes
7,8,,30,26037.0,Married,2.0,Yes
8,9,Female,28,26167.0,Married,1.0,Yes
10,11,Female,28,26905.0,Single,0.0,No


In [73]:
# Sanity check on the filled incomes: list everyone with Age == 28.
# Their incomes all fall roughly in the 26-27K range, so the mean-filled
# values are consistent with the rest of this age group.
new_values.loc[new_values['Age'].eq(28)]

Unnamed: 0,ID,Sex,Age,Income,Employed,Children,Buy_Car
3,4,Male,28,26666.0,Married,3.0,Yes
5,6,Male,28,26462.0,Married,1.0,No
6,7,Female,28,26245.705882,Married,3.0,Yes
8,9,Female,28,26167.0,Married,1.0,Yes
10,11,Female,28,26905.0,Single,0.0,No
13,14,Female,28,26969.0,Married,4.0,Yes
14,15,,28,26234.0,Married,0.0,No
17,18,,28,26245.705882,Single,0.0,Yes
18,19,Male,28,26601.0,Married,1.0,No


In [74]:
# Missing Sex values are left as they are: the author's reasoning is that
# sex can be random, so no value is imputed for it.
new_values

Unnamed: 0,ID,Sex,Age,Income,Employed,Children,Buy_Car
0,1,Male,25,25146.0,Single,0.0,No
1,2,Male,30,26939.0,Married,2.0,Yes
2,3,Male,27,26693.0,Married,0.0,No
3,4,Male,28,26666.0,Married,3.0,Yes
4,5,Male,29,25899.0,Married,0.0,No
5,6,Male,28,26462.0,Married,1.0,No
6,7,Female,28,26245.705882,Married,3.0,Yes
7,8,,30,26037.0,Married,2.0,Yes
8,9,Female,28,26167.0,Married,1.0,Yes
10,11,Female,28,26905.0,Single,0.0,No


## Part 3: Application
1. Gather Data From Any Source (it would be better if you use your own data)
2. Preprocess the data you gathered

In [82]:
import pandas as pd
# Load the student profiling data.
data = pd.read_csv("Fradejas_Student_Profiling.csv")

In [None]:
# Drop duplicate entries that share the same last name and first name.
data = data.drop_duplicates(subset=['Last Name', 'First name'])

In [84]:
# Reset the indices; reset_index() keeps the previous row labels in a new
# 'index' column.
data = data.reset_index()

# Normalise the province to "Romblon" for consistency wherever the answer mentions it.
# .loc with an explicit column limits the assignment to the province column;
# assigning through a bare boolean mask would overwrite every column of the
# matching rows. na=False treats a missing province as "no match" so the
# mask stays strictly boolean.
province_col = 'HOME ADDRESS: Please enter the name of your PROVINCE'
is_romblon = data[province_col].str.contains('Romblon', na=False)
data.loc[is_romblon, province_col] = 'Romblon'

In [None]:
# Summarise every column of the cleaned data.
# NOTE(review): include="all" reports the most frequent value of each text
# column, which may expose personal details - confirm before sharing the output.
data.describe(include="all")


### Due to data privacy issues, I can't show the whole code, but these are some of the scripts used to clean the data