In [9]:
import missingno as msno #pip install missingno
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

In [8]:
pip install missingno 

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Load the raw survey responses from the CSV file next to the notebook.
df = pd.read_csv("survey.csv")

In [3]:
df.head()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [4]:
df.columns

Index(['Timestamp', 'Age', 'Gender', 'Country', 'state', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments'],
      dtype='object')

* 'Timestamp' -> 'Zaman Damgası', 
* 'Age' -> 'Yaş', 
* 'Gender' -> 'Cinsiyet', 
* 'Country' -> 'Ülke', 
* 'state' -> 'Eyalet', 
* 'self_employed' -> 'Serbest Çalışan', 
* 'family_history' -> 'Aile Geçmişi', 
* 'treatment' -> 'Tedavi', 
* 'work_interfere' -> 'İşe Etkisi', 
* 'no_employees' -> 'Çalışan Sayısı', 
* 'remote_work' -> 'Uzaktan Çalışma', 
* 'tech_company' -> 'Teknoloji Şirketi', 
* 'benefits' -> 'Yan Haklar', 
* 'care_options' -> 'Bakım Seçenekleri', 
* 'wellness_program' -> 'Sağlık Programı', 
* 'seek_help' -> 'Yardım Arama', 
* 'anonymity' -> 'Anonimlik', 
* 'leave' -> 'İzin', 
* 'mental_health_consequence' -> 'Ruh Sağlığı Sonucu', 
* 'phys_health_consequence' -> 'Fiziksel Sağlık Sonucu', 
* 'coworkers' -> 'İş Arkadaşları', 
* 'supervisor' -> 'Yönetici', 
* 'mental_health_interview' -> 'Ruh Sağlığı Görüşmesi', 
* 'phys_health_interview' -> 'Fiziksel Sağlık Görüşmesi', 
* 'mental_vs_physical' -> 'Ruhsal vs Fiziksel', 
* 'obs_consequence' -> 'Gözlemlenen Sonuç', 
* 'comments' -> 'Yorumlar'

In [5]:
df.shape

(1259, 27)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  1259 non-null   object
 1   Age                        1259 non-null   int64 
 2   Gender                     1259 non-null   object
 3   Country                    1259 non-null   object
 4   state                      744 non-null    object
 5   self_employed              1241 non-null   object
 6   family_history             1259 non-null   object
 7   treatment                  1259 non-null   object
 8   work_interfere             995 non-null    object
 9   no_employees               1259 non-null   object
 10  remote_work                1259 non-null   object
 11  tech_company               1259 non-null   object
 12  benefits                   1259 non-null   object
 13  care_options               1259 non-null   object
 14  wellness

In [10]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    Split the columns of a DataFrame into categorical, numerical and
    categorical-but-cardinal groups, and print a short summary.

    A column is considered "numeric" when pandas reports a numeric dtype for it
    (integers, floats, booleans, ...). Every other dtype (object, category,
    string, datetime, ...) is considered categorical-like. Relying on the
    numeric check rather than on ``dtype == "object"`` keeps non-object,
    non-numeric columns out of ``num_cols``.

    Note: numeric-looking columns with few distinct values are reported as
    categorical.

    Parameters
    -----
        dataframe: pandas.DataFrame
            DataFrame whose column names should be classified.
        cat_th: int, optional
            A numeric column with fewer than ``cat_th`` distinct values is
            treated as categorical (default: 10).
        car_th: int, optional
            A non-numeric column with more than ``car_th`` distinct values is
            treated as cardinal (default: 20).

    Returns
    -----
        cat_cols: list
            Categorical columns: non-numeric columns that are not cardinal,
            followed by the numeric-but-categorical columns.
        num_cols: list
            Numeric columns that are not numeric-but-categorical.
        cat_but_car: list
            Non-numeric columns with more than ``car_th`` distinct values.

    The numeric-but-categorical columns are only counted in the printed
    summary; they are returned as part of ``cat_cols``.
    """
    columns = list(dataframe.columns)

    # Gather the per-column facts once instead of recomputing them in every
    # comprehension (nunique() is the expensive part).
    is_numeric = {col: pd.api.types.is_numeric_dtype(dataframe[col]) for col in columns}
    n_unique = {col: dataframe[col].nunique() for col in columns}

    # Non-numeric columns are the categorical candidates.
    non_numeric = [col for col in columns if not is_numeric[col]]

    # Numeric columns with few distinct values behave like categories.
    num_but_cat = [col for col in columns if is_numeric[col] and n_unique[col] < cat_th]

    # Non-numeric columns with too many distinct values carry no class structure.
    cat_but_car = [col for col in non_numeric if n_unique[col] > car_th]

    # Final categorical list: candidates plus numeric-but-categorical, minus cardinals.
    cat_cols = [col for col in non_numeric + num_but_cat if col not in cat_but_car]

    # Final numeric list: numeric columns that were not reclassified as categorical.
    num_cols = [col for col in columns if is_numeric[col] and col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")

    return cat_cols, num_cols, cat_but_car

In [13]:
# Classify the survey columns using the default thresholds.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 1259
Variables: 27
cat_cols: 21
num_cols: 1
cat_but_car: 5
num_but_cat: 0


In [17]:
num_cols

['Age']

In [15]:
cat_cols

['self_employed',
 'family_history',
 'treatment',
 'work_interfere',
 'no_employees',
 'remote_work',
 'tech_company',
 'benefits',
 'care_options',
 'wellness_program',
 'seek_help',
 'anonymity',
 'leave',
 'mental_health_consequence',
 'phys_health_consequence',
 'coworkers',
 'supervisor',
 'mental_health_interview',
 'phys_health_interview',
 'mental_vs_physical',
 'obs_consequence']

In [16]:
cat_but_car

['Timestamp', 'Gender', 'Country', 'state', 'comments']