In [2]:
import os

path = "./data/"
os.makedirs(path, exist_ok=True)

In [3]:
data_file = os.path.join(path, "tiny.csv")
with open(data_file, "w") as f:
    f.write('NumRooms,Alley,Price\n') # Column names
    f.write('NA,Pave,127500\n') # Each row represents a data example
    f.write('2,NA,106000\n')
    f.write('4,NA,178100\n')
    f.write('NA,NA,140000\n')


In [4]:
# lets read the file now
import pandas as pd

tiny = pd.read_csv(data_file)

tiny.head()

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


In [6]:
# lets handle missing data

# first seperate input and output

inputs, outputs = tiny.iloc[:,:-1], tiny.iloc[:,-1]

inputs.head(), outputs.head()

(   NumRooms Alley
 0       NaN  Pave
 1       2.0   NaN
 2       4.0   NaN
 3       NaN   NaN,
 0    127500
 1    106000
 2    178100
 3    140000
 Name: Price, dtype: int64)

In [7]:
inputs = inputs.fillna(inputs.mean())

In [8]:
inputs

Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


In [9]:
# lets handle categorical data

inputs = pd.get_dummies(inputs, dummy_na=True)

inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


In [10]:
# converting to tensor

import torch

X,y = torch.tensor(inputs.values), torch.tensor(outputs.values)

X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

In [13]:
new_data_file = os.path.join(path, "trial_12.csv")

tanu = pd.read_csv(new_data_file)

tanu.head()

Unnamed: 0,run_id,condition,success,timeout,failed_images,failed_audio,failed_video,trial_type,trial_index,time_elapsed,...,subject,condition.1,recorded_at,ip,user_agent,device,browser,browser_version,platform,platform_version
0,12,1,True,False,[],[],[],preload,0,2400,...,cf7760ftt62xr0m,Self,2021-03-24 08:03:48,171.76.192.201,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,WebKit,Chrome,89.0.4389.90,Windows,10.0


# Exercises

In [19]:
import random

random.random()

0.49470877574618743

In [22]:
newer_data_file = os.path.join(path, "trail.csv")

new = open(newer_data_file, 'w')

new.write("input1,input2,output\n");

for i in range(10):
    first_value = i
    second_value = "NA" if random.random()> 0.5 else i+2
    new.write(f"{first_value},{second_value},{first_value * 1.5 + 2}\n")

In [56]:
new_df = pd.read_csv(newer_data_file)
new_df

Unnamed: 0,input1,input2,output1
0,0,,2.0
1,1,3.0,3.5
2,2,,5.0
3,3,,6.5
4,4,6.0,8.0
5,5,7.0,9.5
6,6,8.0,11.0
7,7,,12.5
8,8,,14.0
9,9,,15.5


In [57]:
new_df.iloc[0,1]

' NA'

In [58]:
max_nan = 0
max_nan_number = 0
for i in range(len(new_df.columns)):
    current_nan = 0
    for j in range(len(new_df)):
        if new_df.iloc[j, i] == " NA":
            current_nan += 1
        
    if current_nan>max_nan_number:
        max_nan_number = current_nan
        max_nan = i

print(max_nan, max_nan_number)

1 6


In [59]:
new_df.columns

Index(['input1', ' input2', ' output1'], dtype='object')

In [48]:
# deleting the column with max nan number

del_df_1 = new_df.iloc[:,:max_nan]
del_df_2 = new_df.iloc[:,max_nan+1:]

del_df = pd.concat([del_df_1, del_df_2], axis=0)

del_df

Unnamed: 0,input1,output1
0,0.0,
1,1.0,
2,2.0,
3,3.0,
4,4.0,
5,5.0,
6,6.0,
7,7.0,
8,8.0,
9,9.0,


In [49]:
del_df = pd.concat([del_df_1, del_df_2], axis=1)

del_df

Unnamed: 0,input1,output1
0,0,2.0
1,1,3.5
2,2,5.0
3,3,6.5
4,4,8.0
5,5,9.5
6,6,11.0
7,7,12.5
8,8,14.0
9,9,15.5


In [64]:
# simpler way to do this will be

(new_df[' input2']== " NA").sum()

6

In [67]:
# simple way to delete

new_df.drop(' input2', axis=1)

Unnamed: 0,input1,output1
0,0,2.0
1,1,3.5
2,2,5.0
3,3,6.5
4,4,8.0
5,5,9.5
6,6,11.0
7,7,12.5
8,8,14.0
9,9,15.5


In [50]:
# convert into tensor

torch.tensor(del_df.values)

tensor([[ 0.0000,  2.0000],
        [ 1.0000,  3.5000],
        [ 2.0000,  5.0000],
        [ 3.0000,  6.5000],
        [ 4.0000,  8.0000],
        [ 5.0000,  9.5000],
        [ 6.0000, 11.0000],
        [ 7.0000, 12.5000],
        [ 8.0000, 14.0000],
        [ 9.0000, 15.5000]], dtype=torch.float64)