In [None]:
# Contents:
## 1. Introduction to Data Preprocessing
## 2. Standardizing Data
## 3. Feature Engineering
## 4. Selecting features for modeling
## 5. Putting it all together

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [116]:
hiking = pd.read_json('https://assets.datacamp.com/production/repositories/1816/datasets/4f26c48451bdbf73db8a58e226cd3d6b45cf7bb5/hiking.json')
wine = pd.read_csv('https://assets.datacamp.com/production/repositories/1816/datasets/9bd5350dfdb481e0f94eeef6acf2663452a8ef8b/wine_types.csv')
ufo = pd.read_csv('https://assets.datacamp.com/production/repositories/1816/datasets/a5ebfe5d2ed194f2668867603b563963af4769e9/ufo_sightings_large.csv')
volunteer = pd.read_csv('https://assets.datacamp.com/production/repositories/1816/datasets/668b96955d8b252aa8439c7602d516634e3f015e/volunteer_opportunities.csv')




In [10]:
# 1. Introduction to Data Preprocessing

In [9]:
# Check how many values are missing in the category_desc column
print(volunteer['category_desc'].isnull().sum())

# Subset the volunteer dataset
volunteer_subset = volunteer[volunteer['category_desc'].notnull()]

# Print out the shape of the subset
print(volunteer_subset.shape)

48
(617, 35)


In [11]:
# Print the head of the hits column
print(volunteer["hits"].head())

# Convert the hits column to type int
volunteer["hits"] = volunteer["hits"].astype(int)

# Look at the dtypes of the dataset
print(volunteer.dtypes)

0    737
1     22
2     62
3     14
4     31
Name: hits, dtype: int64
opportunity_id          int64
content_id              int64
vol_requests            int64
event_time              int64
title                  object
hits                    int64
summary                object
is_priority            object
category_id           float64
category_desc          object
amsl                  float64
amsl_unit             float64
org_title              object
org_content_id          int64
addresses_count         int64
locality               object
region                 object
postalcode            float64
primary_loc           float64
display_url            object
recurrence_type        object
hours                   int64
created_date           object
last_modified_date     object
start_date_date        object
end_date_date          object
status                 object
Latitude              float64
Longitude             float64
Community Board       float64
Community Council     float64


In [None]:
## 1.2 Class distribution - Stratified sampling

In [14]:
# Stratified sampling
from sklearn.model_selection import train_test_split

# Create a data with all columns except category_desc
volunteer_X = volunteer.drop('category_desc', axis=1)

# Create a category_desc labels dataset
volunteer_y = volunteer[['category_desc']]

# Use stratified sampling to split up the dataset according to the volunteer_y dataset
X_train, X_test, y_train, y_test = train_test_split(volunteer_X, volunteer_y, stratify=volunteer_y)

# Print out the category_desc counts on the training y labels
print(y_train['category_desc'].value_counts())

Strengthening Communities    230
Helping Neighbors in Need     89
Education                     69
Health                        39
Environment                   24
Emergency Preparedness        11
Name: category_desc, dtype: int64


# 2. Standardizing Data

What is standardization?
- Scikit-learn models assume normally distributed data
- Log normalization and feature scaling in this course
- Applied to continuous numerical data

When to standardize: models
- Model in linear space
- Dataset features have high variance
- Dataset features are continuous and on different scales. Not for categorical data.
- Linearity assumptions



## 2.1 Modeling without normalizing

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

wine_X = wine[['Proline','Total phenols','Hue','Nonflavanoid phenols']]
wine_y = wine['Type']

# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(wine_X, wine_y)

# instantiate Knn classifier 
knn = KNeighborsClassifier()

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train,y_train)

# Score the model on the test data
print(knn.score(X_test,y_test))

0.7333333333333333


## 2.2 Log normalization

<img src = './Images/PP-log.png' width = 400 align = "left"  >

In [28]:
# creating copy of wine
wine_copy = wine.copy()

# checking variance in each col
print(wine_copy.var())

# Print out the variance of the Proline column
print('\n variance of the Proline column',wine_copy['Proline'].var())

# Apply the log normalization function to the Proline column
wine_copy['Proline_log'] = np.log(wine_copy['Proline'])

# Check the variance of the normalized Proline column
print('\n variance of the normalized Proline column',wine_copy['Proline_log'].var())

Type                                0.600679
Alcohol                             0.659062
Malic acid                          1.248015
Ash                                 0.075265
Alcalinity of ash                  11.152686
Magnesium                         203.989335
Total phenols                       0.391690
Flavanoids                          0.997719
Nonflavanoid phenols                0.015489
Proanthocyanins                     0.327595
Color intensity                     5.374449
Hue                                 0.052245
OD280/OD315 of diluted wines        0.504086
Proline                         99166.717355
dtype: float64

 variance of the Proline column 99166.71735542428

 variance of the normalized Proline column 0.17231366191842018


## 2.3 Scaling

What is feature scaling?
- Features on different scales
- Model with linear characteristics
- Center features around 0 and transform to unit variance
- Transforms to approximately normal distribution



In [25]:
# Import StandardScaler from scikit-learn
from sklearn.preprocessing import StandardScaler

# Create the scaler
ss = StandardScaler()

# Take a subset of the DataFrame you want to scale 
wine_subset = wine[['Ash','Alcalinity of ash','Magnesium']]

# Apply the scaler to the DataFrame subset
wine_subset_scaled = ss.fit_transform(wine_subset)

## 2.4 KNN on non-scaled data

In [33]:
wine_x2 = wine.drop('Type',axis=1)
wine_y2 = wine['Type']

from sklearn.neighbors import KNeighborsClassifier

# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(wine_x2,wine_y2)

# instantiate Knn classifier 
knn = KNeighborsClassifier()

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train, y_train)

# Score the model on the test data
print(knn.score(X_test, y_test))

0.7333333333333333


## 2.5 KNN on scaled data

- The accuracy score on the unscaled wine dataset was decent, but we can likely do better if we scale the dataset. 

In [35]:
# Create the scaling method.
ss = StandardScaler()

# Apply the scaling method to the dataset used for modeling.
X_scaled = ss.fit_transform(wine_x2)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, wine_y2)

# Fit the k-nearest neighbors model to the training data.
knn.fit(X_train,y_train)

# Score the model on the test data.
print(knn.score(X_test,y_test))

print('\n The increase in accuracy is worth the extra step of scaling the dataset.')

0.9555555555555556

 The increase in accuracy is worth the extra step of scaling the dataset.


# 3. Feature engineering

## 3.1 Encoding categorical variables - binary

In [39]:
from sklearn.preprocessing import LabelEncoder

# Set up the LabelEncoder object
enc = LabelEncoder()

# creating a copy of hiking data
hiking_copy = hiking.copy()

# Apply the encoding to the "Accessible" column
hiking_copy['Accessible_enc'] = enc.fit_transform(hiking_copy['Accessible'])

# Compare the two columns
print(hiking_copy[['Accessible_enc', 'Accessible']].head())

   Accessible_enc Accessible
0               1          Y
1               0          N
2               0          N
3               0          N
4               0          N


In [40]:
# Transform the category_desc column
category_enc = pd.get_dummies(volunteer.copy()["category_desc"])

# Take a look at the encoded columns
print(category_enc.head())

   Education  Emergency Preparedness  Environment  Health  \
0          0                       0            0       0   
1          0                       0            0       0   
2          0                       0            0       0   
3          0                       0            0       0   
4          0                       0            1       0   

   Helping Neighbors in Need  Strengthening Communities  
0                          0                          0  
1                          0                          1  
2                          0                          1  
3                          0                          1  
4                          0                          0  


In [None]:
## 3.2 Engineering numerical features

In [45]:
volunteer_start = volunteer.copy()

# First, convert string column to date column
volunteer_start["start_date_converted"] = pd.to_datetime(volunteer_start["start_date_date"])

# Extract just the month from the converted column
volunteer_start["start_date_month"] = volunteer_start["start_date_converted"].apply(lambda row: row.month)

# Take a look at the converted and new month columns
print(volunteer_start[['start_date_converted', 'start_date_month']].head())

  start_date_converted  start_date_month
0           2011-07-30                 7
1           2011-02-01                 2
2           2011-01-29                 1
3           2011-02-14                 2
4           2011-02-05                 2


In [None]:
## 3.3 Engineering features from strings - extraction

In [77]:
hiking_mile = hiking.copy()
hiking_mile = hiking_mile[hiking_mile['Length'].notna()]

import re 

# Write a pattern to extract numbers and decimals
def return_mileage(length):
    pattern = re.compile(r"\d+\.\d+")
    
    # Search the text for matches
    mile = re.match(pattern, length)
    
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))
        
# Apply the function to the Length column and take a look at both columns
hiking_mile["Length_num"] = hiking_mile["Length"].apply(lambda row: return_mileage(row))
print(hiking_mile[["Length", "Length_num"]].head())

       Length  Length_num
0   0.8 miles        0.80
1    1.0 mile        1.00
2  0.75 miles        0.75
3   0.5 miles        0.50
4   0.5 miles        0.50


## 3.4 Engineering features from strings - tf/idf

In [107]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB 

# Take the title text
title_text = volunteer[volunteer['category_desc'].notna()]['title']

# Create the vectorizer method
tfidf_vec = TfidfVectorizer()

# Transform the text into tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)

# Split the dataset according to the class distribution of category_desc
y = volunteer["category_desc"].dropna()
X_train, X_test, y_train, y_test = train_test_split(text_tfidf.toarray(), y, stratify=y)

# instantiate nb algo
nb = GaussianNB()
# Fit the model to the training data
nb.fit(X_train, y_train)

# Print out the model's accuracy
print(nb.score(X_test, y_test))


0.535483870967742


# 4. Selecting features for modeling

- Redundant features
- Remove noisy features
- Remove correlated features
- Remove duplicated features


## 4.1 Selecting relevant features

In [167]:
volunteer_rf = volunteer[['vol_requests','title','hits','category_desc','locality','region','postalcode','created_date']]
volunteer_rf['vol_requests_lognorm'] = np.log(volunteer_rf['vol_requests'])

# First, convert string column to date column
volunteer_rf["created_date"] = pd.to_datetime(volunteer_rf["created_date"])

# creating month col
volunteer_rf["created_month"] = volunteer_rf["created_date"].apply(lambda row: row.month)

# get dummies
category_enc = pd.get_dummies(volunteer_rf['category_desc'])

# concat for final df
volunteer_rf = pd.concat([volunteer_rf,category_enc],axis=1)

# treating na
volunteer_rf = volunteer_rf[volunteer_rf['category_desc'].notna()]
volunteer_rf = volunteer_rf[volunteer_rf['postalcode'].notna()]

In [169]:
# Create a list of redundant column names to drop
to_drop = ["locality", "region", "category_desc", "created_date", "vol_requests"]

# Drop those columns from the dataset
volunteer_subset = volunteer_rf.drop(to_drop, axis=1)

# Print out the head of the new dataset
print(volunteer_subset.head())

                                               title  hits  postalcode  \
1                                       Web designer    22     10010.0   
2      Urban Adventures - Ice Skating at Lasker Rink    62     10026.0   
3  Fight global hunger and support women farmers ...    14      2114.0   
4                                      Stop 'N' Swap    31     10455.0   
5                               Queens Stop 'N' Swap   135     11372.0   

   vol_requests_lognorm  created_month  Education  Emergency Preparedness  \
1              0.693147              1          0                       0   
2              2.995732              1          0                       0   
3              6.214608              1          0                       0   
4              2.708050              1          0                       0   
5              2.708050              1          0                       0   

   Environment  Health  Helping Neighbors in Need  Strengthening Communities  
1            

## 4.2 Checking for correlated features

In [175]:
wine_cor = wine[['Flavanoids','Total phenols','Malic acid','OD280/OD315 of diluted wines','Hue']]


In [176]:
# Print out the column correlations of the wine dataset
print(wine_cor.corr())

# Take a minute to find the column where the correlation value is greater than 0.75 at least twice
to_drop = "Flavanoids"

# Drop that column from the DataFrame
wine_cor = wine_cor.drop(to_drop, axis=1)

print('\n Dropping correlated features is often an iterative process, so you may need to try different combinations in your model.')


                              Flavanoids  Total phenols  Malic acid  \
Flavanoids                      1.000000       0.864564   -0.411007   
Total phenols                   0.864564       1.000000   -0.335167   
Malic acid                     -0.411007      -0.335167    1.000000   
OD280/OD315 of diluted wines    0.787194       0.699949   -0.368710   
Hue                             0.543479       0.433681   -0.561296   

                              OD280/OD315 of diluted wines       Hue  
Flavanoids                                        0.787194  0.543479  
Total phenols                                     0.699949  0.433681  
Malic acid                                       -0.368710 -0.561296  
OD280/OD315 of diluted wines                      1.000000  0.565468  
Hue                                               0.565468  1.000000  

 Dropping correlated features is often an iterative process, so you may need to try different combinations in your model.


## 4.3 Selecting features using text vectors

In [178]:
# Refer to the course

### 4.3.2 PCA - principal component analysis

In [182]:
from sklearn.decomposition import PCA

# Set up PCA and the X vector for diminsionality reduction
pca = PCA()
wine_pca = wine.drop("Type", axis=1)
wine_pca_y = wine['Type']

# Apply PCA to the wine dataset X vector
transformed_X = pca.fit_transform(wine_X)

# Look at the percentage of variance explained by the different components
print(pca.explained_variance_ratio_)

[9.99996392e-01 3.07041250e-06 4.16606366e-07 1.21244332e-07]


In [184]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Split the transformed X and the y labels into training and test sets
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(transformed_X,wine_pca_y)

# Fit knn to the training data
knn.fit(X_wine_train,y_wine_train)

# Score knn on the test data and print it out
knn.score(X_wine_test,y_wine_test)

0.6444444444444445

# 5. Bringing it all together

In [216]:
ufo_at = ufo.copy()

# Check the column types
print(ufo_at.dtypes)

# Change the type of seconds to float
ufo_at["seconds"] = ufo_at["seconds"].astype(float)

# Change the date column to type datetime
ufo_at["date"] = pd.to_datetime(ufo_at["date"])

# Check the column types
print('\n',ufo_at[["seconds", "date"]].dtypes)

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object

 seconds           float64
date       datetime64[ns]
dtype: object


In [217]:
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo_at[['length_of_time', 'state', 'type']].isnull().sum())

# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo_at[ufo_at['length_of_time'].notnull() & 
          ufo_at['state'].notnull() & 
          ufo_at['type'].notnull()]

# Print out the shape of the new dataset
print('\n',ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64

 (4283, 11)


In [218]:
def return_minutes(time_string):

    # Use \d+ to grab digits
    pattern = re.compile(r"\d+")
    
    # Use match on the pattern and column
    num = re.match(pattern, time_string)
    if num is not None:
        return int(num.group(0))
        
# Apply the extraction to the length_of_time column
ufo_no_missing["minutes"] = ufo_no_missing["length_of_time"].apply(lambda x: return_minutes(x))

# Take a look at the head of both of the columns
print(ufo_no_missing[['length_of_time','minutes']].head())

    length_of_time  minutes
0          2 weeks      2.0
1           30sec.     30.0
3  about 5 minutes      NaN
4                2      2.0
5       10 minutes     10.0


In [219]:
# Check the variance of the seconds and minutes columns
print(ufo_no_missing[['seconds','minutes']].var())

# Log normalize the seconds column
ufo_no_missing["seconds_log"] = np.log(ufo_no_missing["seconds"])

# Print out the variance of just the seconds_log column
print(ufo_no_missing['seconds_log'].var())

seconds    1.545212e+10
minutes    9.470577e+02
dtype: float64
nan


In [220]:
# Use Pandas to encode us values as 1 and others as 0
ufo_no_missing["country_enc"] = ufo_no_missing["country"].apply(lambda x: 1 if x == 'us' else 0)

# Print the number of unique type values
print(len(ufo_no_missing['type'].unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo_no_missing['type'])

# Concatenate this set back to the ufo DataFrame
ufo_no_missing = pd.concat([ufo_no_missing, type_set], axis=1)

21


In [221]:
# Look at the first 5 rows of the date column
print(ufo_no_missing['date'].head())

# Extract the month from the date column
ufo_no_missing["month"] = ufo_no_missing["date"].apply(lambda row: row.month)

# Extract the year from the date column
ufo_no_missing["year"] = ufo_no_missing["date"].apply(lambda row: row.year)

# Take a look at the head of all three columns
print('\n',ufo_no_missing[['date','month','year']].head(5))

0   2011-11-03 19:21:00
1   2004-10-03 19:05:00
3   2002-11-21 05:45:00
4   2010-08-19 12:55:00
5   2012-06-16 23:00:00
Name: date, dtype: datetime64[ns]

                  date  month  year
0 2011-11-03 19:21:00     11  2011
1 2004-10-03 19:05:00     10  2004
3 2002-11-21 05:45:00     11  2002
4 2010-08-19 12:55:00      8  2010
5 2012-06-16 23:00:00      6  2012


In [222]:
# Take a look at the head of the desc field
print(ufo_no_missing["desc"].head())

# Create the tfidf vectorizer object
vec = TfidfVectorizer()

# Use vec's fit_transform method on the desc field
desc_tfidf = vec.fit_transform(ufo_no_missing["desc"])

# Look at the number of columns this creates
print('\n',desc_tfidf.shape)

0    Red blinking objects similar to airplanes or s...
1                 Many fighter jets flying towards UFO
3    It was a large&#44 triangular shaped flying ob...
4       A white spinning disc in the shape of an oval.
5    Dancing lights that would fly around and then ...
Name: desc, dtype: object

 (4283, 5754)


In [223]:
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
    
        # Here we'll call the function from the previous exercise, and extend the list we're creating
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
    # Return the list in a set, so we don't get duplicate word indices
    return set(filter_list)


vocab = {v:k for k,v in 
   vec.vocabulary_.items()}

In [243]:
# Check the correlation between the seconds, seconds_log, and minutes columns
print(ufo_no_missing[['seconds','seconds_log','minutes']].corr())

# Make a list of features to drop
to_drop = ['city','country','country_enc','date','desc','lat','length_of_time','long','minutes','recorded','seconds','state','seconds_log']

# Drop those features
ufo_dropped = ufo_no_missing.drop(to_drop,axis=1)

# Let's also filter some words out of the text vector we created
filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)

              seconds  seconds_log   minutes
seconds      1.000000     0.174331 -0.009932
seconds_log  0.174331     1.000000  0.111460
minutes     -0.009932     0.111460  1.000000


In [245]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Creating feature and target variable
ufo_dropped_X = ufo_dropped.drop('type',axis=1)
ufo_dropped_y = ufo_dropped['type']

# Take a look at the features in the X set of data
print(ufo_dropped_X.columns)

# Split the X and y sets using train_test_split, setting stratify=y
train_X, test_X, train_y, test_y = train_test_split(ufo_dropped_X,ufo_dropped_y,stratify=ufo_dropped_y)

# instantiate knn classifier
knn = KNeighborsClassifier()

# Fit knn to the training sets
knn.fit(train_X,train_y)

# Print the score of knn on the test sets
print('\n',knn.score(test_X,test_y))

Index(['changing', 'chevron', 'cigar', 'circle', 'cone', 'cross', 'cylinder',
       'diamond', 'disk', 'egg', 'fireball', 'flash', 'formation', 'light',
       'other', 'oval', 'rectangle', 'sphere', 'teardrop', 'triangle',
       'unknown', 'month', 'year'],
      dtype='object')

 0.811391223155929
