In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw survey export into memory and preview the first five rows.
raw_survey_csv = "../data/BigML_Dataset_5f50a62795a9306aa200003e.csv"
df = pd.read_csv(raw_survey_csv)
df.head()

Unnamed: 0,Age,Sex,Race,Marital status?,Education,Employement,Incomes,How many books did you read during last 12months?,Read any printed books during last 12months?,Read any audiobooks during last 12months?,Read any e-books during last 12months?,"Last book you read, you…",Do you happen to read any daily news or newspapers?,Do you happen to read any magazines or journals?
0,66,Male,Refused,Divorced,College graduate,Retired,"$20,000 to under $30,000",97,Yes,No,Yes,Purchased the book,No,Yes
1,46,Male,Native American/American Indian,Married,High school graduate,Employed full-time,"Less than $10,000",97,Yes,Yes,Yes,Purchased the book,Yes,Yes
2,32,Male,Mixed race,Never been married,High school graduate,Employed full-time,"Less than $10,000",97,No,Yes,Yes,Borrowed the book from a friend or family member,Yes,Yes
3,27,Male,Mixed race,Married,High school graduate,Employed full-time,"$40,000 to under $50,000",97,Yes,No,Yes,Borrowed the book from a library,Yes,No
4,16,Female,Mixed race,Never been married,High school incomplete,Employed part-time,"$10,000 to under $20,000",97,Yes,Yes,No,Purchased the book,Yes,No


### Sütun İsimlerinin Temizlenmesi

Sütun isimleri karmaşık ve boşluk/özel karakter içeriyor. Daha kolay analiz için:
- Küçük harfe çeviriyoruz,
- Boşlukları `_` ile değiştiriyoruz,
- `?` karakterini siliyoruz, `-` karakterini ise `_` ile değiştiriyoruz.

In [5]:
# Normalise every header through the vectorised string accessor: trim the
# edges, lower-case, turn spaces and hyphens into underscores, and drop
# question marks. regex=False keeps "?" a literal character rather than a
# regex quantifier. Other punctuation (e.g. the comma and the ellipsis in
# "last_book_you_read,_you…") is intentionally left as-is.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("?", "", regex=False)
    .str.replace("-", "_", regex=False)
)
df.columns

Index(['age', 'sex', 'race', 'marital_status', 'education', 'employement',
       'incomes', 'how_many_books_did_you_read_during_last_12months',
       'read_any_printed_books_during_last_12months',
       'read_any_audiobooks_during_last_12months',
       'read_any_e_books_during_last_12months', 'last_book_you_read,_you…',
       'do_you_happen_to_read_any_daily_news_or_newspapers',
       'do_you_happen_to_read_any_magazines_or_journals'],
      dtype='object')

### Eksik Verilerin Kontrolü

In [6]:
# Per-column count of missing entries before any filling is applied.
df.isna().sum()

age                                                     0
sex                                                     0
race                                                    0
marital_status                                          0
education                                              58
employement                                             0
incomes                                                 0
how_many_books_did_you_read_during_last_12months        0
read_any_printed_books_during_last_12months           390
read_any_audiobooks_during_last_12months              390
read_any_e_books_during_last_12months                 390
last_book_you_read,_you…                              390
do_you_happen_to_read_any_daily_news_or_newspapers      0
do_you_happen_to_read_any_magazines_or_journals         0
dtype: int64

### Eksik Verilerin Doldurulması

- `education` sütunundaki eksik değerler `"Unknown"` ile dolduruluyor.
- Kitap okuma ile ilgili dört sütundaki (basılı kitap, sesli kitap, e-kitap ve son okunan kitabın nasıl edinildiği) eksik değerler `"No"` ile dolduruluyor.

In [7]:
# Rows with no recorded education level get an explicit placeholder category
# instead of staying NaN.
missing_education_label = 'Unknown'
df['education'] = df['education'].fillna(missing_education_label)

In [9]:
# Columns whose missing values are filled together in the next cell.
# The first three are the per-format questions (printed / audio / e-book);
# the last is the "Last book you read, you…" question. Its cleaned name
# still contains a literal comma and the "…" ellipsis character, so it must
# be spelled exactly as it appears in df.columns.
book_columns = [
    'read_any_printed_books_during_last_12months',
    'read_any_audiobooks_during_last_12months',
    'read_any_e_books_during_last_12months',
    'last_book_you_read,_you…'
]



In [10]:
# Replace missing answers in each listed book column with 'No'.
# Names that are not present in the frame are skipped silently.
# NOTE(review): 'last_book_you_read,_you…' holds values such as
# "Purchased the book" in the preview, so 'No' becomes an extra category
# there - confirm that is intended.
for column_name in book_columns:
    if column_name not in df.columns:
        continue
    df[column_name] = df[column_name].fillna('No')

In [11]:
# Re-count missing entries per column to confirm the fills took effect.
df.isna().sum()

age                                                   0
sex                                                   0
race                                                  0
marital_status                                        0
education                                             0
employement                                           0
incomes                                               0
how_many_books_did_you_read_during_last_12months      0
read_any_printed_books_during_last_12months           0
read_any_audiobooks_during_last_12months              0
read_any_e_books_during_last_12months                 0
last_book_you_read,_you…                              0
do_you_happen_to_read_any_daily_news_or_newspapers    0
do_you_happen_to_read_any_magazines_or_journals       0
dtype: int64

In [12]:
# Single boolean: is there any missing value left anywhere in the frame?
df.isna().to_numpy().any()

np.False_

In [13]:
# Persist the cleaned frame; index=False leaves the row index out of the file.
cleaned_csv_path = "../data/cleaned_reading_data.csv"
df.to_csv(cleaned_csv_path, index=False)