<a href="https://colab.research.google.com/github/fundaylncii/FeatureEngineering/blob/main/FeatureExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Ham veriden değişken türetme işlemidir.
## Yapısal veya yapısal olmayan(resim,ses) gibi değerlerden yeni değişkenler türetilebilir.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import missingno as msno
from datetime import date
from sklearn.metrics  import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_row",None)
pd.set_option("display.float_format", lambda x: "%.3f" % x)
pd.set_option("display.width", 500)

In [3]:
## Binary Features: var olan değişkenler içerisinde 1-0 değerlerini içeren yeni değişkenlerin türetilmesi

df_ = pd.read_csv("/content/titanic.csv")
df = df_.copy()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
## titanic veri setinde cabin değişkeninde null veya dolu değerler için 1-0 değerlerinden bir değişken oluşturulabilir
df["NEW_CABIN_BOOL"] = df["Cabin"].notnull().astype("int")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_CABIN_BOOL
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [9]:
## bağımlı değişken ile yeni oluşturulan değişken arasında oran testinin yapılması:
from statsmodels.stats.proportion import proportions_ztest

test_stat, pvalue = proportions_ztest(count= [df.loc[df["NEW_CABIN_BOOL"] == 1, "Survived"].sum(),
                                      df.loc[df["NEW_CABIN_BOOL"] == 0 , "Survived"].sum()],

                                      nobs = [df.loc[df["NEW_CABIN_BOOL"] == 1, "Survived"].shape[0],
                                              df.loc[df["NEW_CABIN_BOOL"] == 0 , "Survived"].shape[0]])

print("Test Stat = %.4f, pvalue = %.4f" % (test_stat,pvalue))

Test Stat = 9.4597, pvalue = 0.0000


Cabin numarası olmayanların hayatta kalma oranı ile olanların hayatta kalma oranının z testi sonucunda pvalue: 0.000 < 0.05 olduğu için h0 reddedilemez. Yani iki değer arasında istatistiki olarak anlamlı fark vardır.

In [10]:
## SibSp ve Parch değişkenlerin yolcuların yanındaki kişilerin sayısını belirtir. Eğer iki değişken toplamı 0 dan büyük ise yolcu yalnız değildir.
df.loc[((df["SibSp"] + df["Parch"]) > 0) , "NEW_IS_ALONE"] = "NO"
df.loc[((df["SibSp"] + df["Parch"]) == 0) , "NEW_IS_ALONE"] = "YES"
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_CABIN_BOOL,NEW_IS_ALONE
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,NO
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,1,NO
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,YES
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,NO
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,YES


In [11]:
## yalnız olanlar ile olmayanlar arasında hayatta kalma oranına göre fark var gibi
df.groupby("NEW_IS_ALONE").agg({"Survived": "mean"})

Unnamed: 0_level_0,Survived
NEW_IS_ALONE,Unnamed: 1_level_1
NO,0.506
YES,0.304


In [12]:
## Bu değişkene istinaden oran testi yapılabilir.
from statsmodels.stats.proportion import proportions_ztest

test_stat, pvalue = proportions_ztest(count= [df.loc[df["NEW_IS_ALONE"] == "YES", "Survived"].sum(),
                                      df.loc[df["NEW_IS_ALONE"] == "NO" , "Survived"].sum()],

                                      nobs = [df.loc[df["NEW_IS_ALONE"] == "YES", "Survived"].shape[0],
                                              df.loc[df["NEW_IS_ALONE"] == "NO" , "Survived"].shape[0]])

print("Test Stat = %.4f, pvalue = %.4f" % (test_stat,pvalue))

Test Stat = -6.0704, pvalue = 0.0000


Yalnız olanların hayatta kalma oranı ile yalnız olmayanların hayatta kalma oranının z testi sonucunda pvalue: 0.000 < 0.05 olduğu için h0 reddedilemez. Yani iki değer arasında istatistiki olarak anlamlı fark vardır.

In [16]:
## TEXT FEATURE:

df = df_.copy()
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C


In [17]:
## Letter Count:
df["NEW_NAME_COUNT"] = df["Name"].str.len()
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_NAME_COUNT
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,51


In [19]:
## WORD COUNT:
df["NEW_NAME_WORD_COUNT"] = df["Name"].apply(lambda x: len(str(x).split(" ")))
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_NAME_COUNT,NEW_NAME_WORD_COUNT
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,4
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,51,7


In [22]:
## ÖZEL YAPILARI YAKALAMAK:
## name alanında dr. terimine göe yolcunun doktor olma durumunun çıkartılması

df["NEW_NAME_DR"] = df["Name"].apply(lambda x: len([x for x in x.split() if x.startswith("Dr")]))
df.groupby("NEW_NAME_DR").agg({"Survived": "mean"})

Unnamed: 0_level_0,Survived
NEW_NAME_DR,Unnamed: 1_level_1
0,0.383
1,0.5


Doktor olanların hayatta kalma oranı daha yüksek!!

In [23]:
## REGEX FEATURES:
df["NEW_TITLE"] = df.Name.str.extract(" ([A-Za-z]+)\.", expand=False)
df.head(2)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_NAME_COUNT,NEW_NAME_WORD_COUNT,NEW_NAME_DR,NEW_TITLE
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,4,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,51,7,0,Mrs


In [24]:
df[["NEW_TITLE","Survived","Age"]].groupby("NEW_TITLE").agg({"Survived": "mean",
                                                             "Age": ["count", "mean"]})

Unnamed: 0_level_0,Survived,Age,Age
Unnamed: 0_level_1,mean,count,mean
NEW_TITLE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Capt,0.0,1,70.0
Col,0.5,2,58.0
Countess,1.0,1,33.0
Don,0.0,1,40.0
Dr,0.429,6,42.0
Jonkheer,0.0,1,38.0
Lady,1.0,1,48.0
Major,0.5,2,48.5
Master,0.575,36,4.574
Miss,0.698,146,21.774


In [30]:
## DATE FEATURES:
dff = pd.read_csv("/content/course_reviews.csv")
dff.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0


In [31]:
dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4323 entries, 0 to 4322
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rating              4323 non-null   float64
 1   Timestamp           4323 non-null   object 
 2   Enrolled            4323 non-null   object 
 3   Progress            4323 non-null   float64
 4   Questions Asked     4323 non-null   float64
 5   Questions Answered  4323 non-null   float64
dtypes: float64(4), object(2)
memory usage: 202.8+ KB


In [33]:
## Tarih ifadeleri object olarak görünmektedir. Bu değişkenin tarih olarak düzenlenmesi gerekmektedir.
dff["Timestamp"] = pd.to_datetime(dff["Timestamp"], format="%Y-%m-%d %H:%M:%S")
dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4323 entries, 0 to 4322
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Rating              4323 non-null   float64       
 1   Timestamp           4323 non-null   datetime64[ns]
 2   Enrolled            4323 non-null   object        
 3   Progress            4323 non-null   float64       
 4   Questions Asked     4323 non-null   float64       
 5   Questions Answered  4323 non-null   float64       
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 202.8+ KB


In [34]:
dff["year"] = dff["Timestamp"].dt.year
dff["month"] = dff["Timestamp"].dt.month
dff["day"] = dff["Timestamp"].dt.day
dff.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered,year,month,day
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0,2021,2,5
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0,2021,2,4
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0,2021,2,4
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0,2021,2,4
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0,2021,2,4


In [36]:
## month diff ( iki tarih arasındaki ay farkı) : yıl farkı + ay farkı
dff["month_diff"] = (date.today().year - dff["Timestamp"].dt.year) * 12 + date.today().month - dff["Timestamp"].dt.month
dff.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered,year,month,day,month_diff
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0,2021,2,5,41
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0,2021,2,4,41
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0,2021,2,4,41
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0,2021,2,4,41
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0,2021,2,4,41


In [37]:
## gün isimlerinin oluşturulması:

dff["day_name"] = dff["Timestamp"].dt.day_name()
dff.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered,year,month,day,month_diff,day_name
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0,2021,2,5,41,Friday
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0,2021,2,4,41,Thursday
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0,2021,2,4,41,Thursday
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0,2021,2,4,41,Thursday
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0,2021,2,4,41,Thursday


In [39]:
## ÖZELLİK ETKİLEŞİMLERİ:

df = df_.copy()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [44]:
df["NEW_AGE_PCLASS"] = df["Age"] * df["Pclass"]
df["NEW_FAMILY_SIZE"] = df["SibSp"] + df["Parch"] + 1
df.loc[(df["Sex"] == "male") & (df["Age"] <= 21), "NEW_SEX_CAT"] = "youngmale"
df.loc[(df["Sex"] == "male") & ((df["Age"] > 21) & (df["Age"] <= 50)), "NEW_SEX_CAT"] = "maturemale"
df.loc[(df["Sex"] == "male") & (df["Age"] > 50), "NEW_SEX_CAT"] = "seniormale"
df.loc[(df["Sex"] == "female") & (df["Age"] <= 21), "NEW_SEX_CAT"] = "youngfemale"
df.loc[(df["Sex"] == "female") & ((df["Age"] > 21) & (df["Age"] <= 50)), "NEW_SEX_CAT"] = "maturefemale"
df.loc[(df["Sex"] == "female") & (df["Age"] > 50), "NEW_SEX_CAT"] = "seniorfemale"
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_AGE_PCLASS,NEW_FAMILY_SIZE,NEW_SEX_CAT
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,66.0,2,maturemale
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,38.0,2,maturefemale
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,78.0,1,maturefemale
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,2,maturefemale
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,105.0,1,maturemale


In [45]:
df.groupby("NEW_SEX_CAT")["Survived"].mean()

NEW_SEX_CAT
maturefemale   0.775
maturemale     0.199
seniorfemale   0.941
seniormale     0.128
youngfemale    0.679
youngmale      0.250
Name: Survived, dtype: float64