# Data preparation

### Importing Dataset

In [78]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# header=None tells pandas not to treat the first row as column names,
# so the columns receive integer labels until they are renamed.
data = pd.read_csv(
    filepath_or_buffer='HTRU_2.csv',
    header=None,
)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [79]:
"""
1. Mean of the integrated profile.
2. Standard deviation of the integrated profile.
3. Excess kurtosis of the integrated profile.
4. Skewness of the integrated profile.
5. Mean of the DM-SNR curve.
6. Standard deviation of the DM-SNR curve.
7. Excess kurtosis of the DM-SNR curve.
8. Skewness of the DM-SNR curve.
"""

# Replace the integer column labels with descriptive names. The first eight
# names follow the order of the feature list above; the ninth is "Class".
col_names = [
    "Mean_IP",
    "Std_IP",
    "Ex_Kurt_IP",
    "Skew_IP",
    "Mean_DM_SNR",
    "Std_DM_SNR",
    "Ex_Kurt_DM_SNR",
    "Skew_DM_SNR",
    "Class",
]
data.columns = col_names
data.head()

Unnamed: 0,Mean_IP,Std_IP,Ex_Kurt_IP,Skew_IP,Mean_DM_SNR,Std_DM_SNR,Ex_Kurt_DM_SNR,Skew_DM_SNR,Class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [16]:
# Count the rows carrying each value of the "Class" column; the result is
# kept in `value_count` and printed.
value_count = data.loc[:, "Class"].value_counts()
print(value_count)

0    16259
1     1639
Name: Class, dtype: int64


### Creating a new DataFrame with an equal number of rows for each value of the Class column

In [91]:
# Balance the two classes by undersampling: keep every class-1 row and draw
# the same number of class-0 rows at random.
# The sample size is derived here from the class-1 rows (the same number as
# value_count[1]) so this cell does not depend on another cell having been
# run first.
class_one_rows = data.loc[data["Class"] == 1]
class_zero_sample = data.loc[data["Class"] == 0].sample(
    n=len(class_one_rows),
    random_state=50,  # fixed seed: the same rows are drawn on every run
)
data_new = pd.concat((class_zero_sample, class_one_rows), ignore_index=True)

# Shuffle the rows (seeded) so the two classes are interleaved, then rebuild
# a clean 0..n-1 index.
data_new = sklearn.utils.shuffle(data_new, random_state=50).reset_index(drop=True)
data_new.head(20)

Unnamed: 0,Mean_IP,Std_IP,Ex_Kurt_IP,Skew_IP,Mean_DM_SNR,Std_DM_SNR,Ex_Kurt_DM_SNR,Skew_DM_SNR,Class
0,93.75,40.657679,0.910555,2.16469,5.09699,28.217898,5.916694,35.542304,1
1,77.492188,36.010414,1.873517,6.508272,8.030936,37.252414,5.081888,25.683906,1
2,112.476562,58.971122,0.394276,-0.677985,3.253344,18.03875,8.181099,82.644551,0
3,81.65625,43.920105,1.320341,3.307269,13.639632,47.175894,3.604874,12.059129,1
4,110.054688,45.564053,0.379363,0.13388,1.261706,13.032154,13.294267,202.930061,0
5,115.929688,50.650176,0.102399,0.099721,1.397157,13.723267,12.605644,184.778788,0
6,94.695312,46.575134,0.33547,0.153277,3.459866,24.042832,7.285706,53.769442,1
7,49.015625,43.054287,3.142802,10.589181,12.489967,41.171808,3.778334,14.256524,1
8,106.71875,48.599919,0.483138,0.060912,6.269231,29.929925,6.026228,38.468864,1
9,25.132812,34.706666,4.815267,25.225663,91.404682,86.555964,0.390899,-1.237221,1


In [92]:
# Show the per-class row counts of the resampled frame.
data_new.loc[:, "Class"].value_counts()

1    1639
0    1639
Name: Class, dtype: int64

### Writing new Dataframe to File

In [None]:
# Save the resampled frame; index=False leaves the row index out of the file.
data_new.to_csv(
    path_or_buf='HTRU_2_modified.csv',
    index=False,
)