# Data Preprocessing

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [2]:
# Silence every warning category for the rest of the session; note that this
# also hides deprecation notices raised by the imported libraries.
warnings.filterwarnings('ignore')
# Location of the raw telecom CSV, given relative to the working directory.
CSV_PATH = "../data/raw/Week1_challenge_data_source.csv"

In [3]:
# Read a CSV file into a DataFrame

def read_proccessed_data(csv_path):
    """Load the CSV file at ``csv_path`` into a pandas DataFrame.

    Args:
        csv_path: Path (or any other input accepted by ``pd.read_csv``) of the
            CSV file to load.

    Returns:
        The parsed ``pd.DataFrame``, or ``None`` when the file does not exist.
        In that case a short message is printed instead of raising, so callers
        must be prepared to receive ``None``.
    """
    try:
        return pd.read_csv(csv_path)
    except FileNotFoundError:
        # Best-effort behaviour: report the problem and hand back None.
        print("file not found")
        return None

In [4]:
# Print the number of rows and columns, followed by the column summary

def get_data_info(df: pd.DataFrame):
    """Print the dimensions of ``df`` followed by its ``DataFrame.info()`` summary.

    Args:
        df: The frame to describe.

    Returns:
        ``None``. Everything is written to standard output; ``DataFrame.info()``
        itself prints its report and returns nothing.
    """
    n_rows, n_cols = df.shape

    print(f"Number of rows: {n_rows}")
    print(f"Number of columns: {n_cols}")

    # info() prints the per-column dtype / non-null report and yields None.
    df.info()
    return None

In [5]:
# basic statistics like mean, std and percentiles

def get_statistics_info(df: pd.DataFrame):
    """Return descriptive statistics for every column of ``df``.

    ``include='all'`` keeps non-numeric columns in the summary, so the result
    contains count/unique/top/freq rows alongside mean/std/min/percentiles/max;
    statistics that do not apply to a given column are NaN.

    Args:
        df: The frame to summarise.

    Returns:
        The ``pd.DataFrame`` produced by ``df.describe(include='all')``.
    """
    summary = df.describe(include='all')
    return summary

In [6]:
# reading the telecom data and getting information

# Load the raw CSV; xDR_df is None when the file is missing, because the
# reader prints a message instead of raising.
xDR_df = read_proccessed_data(CSV_PATH)
# Prints the row/column counts followed by the DataFrame.info() summary.
get_data_info(xDR_df)
# Last expression of the cell, so the describe() table is rendered as output.
get_statistics_info(xDR_df)

Number of rows: 150001
Number of columns: 55
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 55 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Bearer Id                                 149010 non-null  float64
 1   Start                                     150000 non-null  object 
 2   Start ms                                  150000 non-null  float64
 3   End                                       150000 non-null  object 
 4   End ms                                    150000 non-null  float64
 5   Dur. (ms)                                 150000 non-null  float64
 6   IMSI                                      149431 non-null  float64
 7   MSISDN/Number                             148935 non-null  float64
 8   IMEI                                      149429 non-null  float64
 9   Last Location Name                        14884

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
count,149010.0,150000,150000.0,150000,150000.0,150000.0,149431.0,148935.0,149429.0,148848.0,...,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150000.0,150000.0
unique,,9997,,6403,,,,,,45036.0,...,,,,,,,,,,
top,,4/26/2019 7:25,,4/25/2019 0:01,,,,,,9160000000000000.0,...,,,,,,,,,,
freq,,203,,1150,,,,,,1881.0,...,,,,,,,,,,
mean,1.012554e+19,,499.1882,,498.80088,104608.6,208000000000000.0,41880170000.0,48474270000000.0,,...,11634070.0,11009410.0,11626850.0,11001750.0,422044700.0,8288398.0,421100500.0,8264799.0,41121210.0,454643400.0
std,2.878585e+18,,288.611834,,288.097653,81037.62,21332190000.0,2446482000000.0,22421000000000.0,,...,6710569.0,6345423.0,6725218.0,6359490.0,243967500.0,4782700.0,243205000.0,4769004.0,11276390.0,244142900.0
min,6.92e+18,,0.0,,0.0,7142.0,204000000000000.0,33601000000.0,440000000000.0,,...,53.0,105.0,42.0,35.0,2516.0,59.0,3290.0,148.0,2866892.0,7114041.0
25%,7.35e+18,,250.0,,251.0,57440.5,208000000000000.0,33651300000.0,35500000000000.0,,...,5833501.0,5517965.0,5777156.0,5475981.0,210473300.0,4128476.0,210186900.0,4145943.0,33222010.0,243106800.0
50%,7.35e+18,,499.0,,500.0,86399.0,208000000000000.0,33663710000.0,35700000000000.0,,...,11616020.0,11013450.0,11642220.0,10996380.0,423408100.0,8291208.0,421803000.0,8267071.0,41143310.0,455841100.0
75%,1.3e+19,,749.0,,750.0,132430.2,208000000000000.0,33683490000.0,86100000000000.0,,...,17448520.0,16515560.0,17470480.0,16507270.0,633174200.0,12431620.0,631691800.0,12384150.0,49034240.0,665705500.0


## Handling Missing Values

In [8]:
def percent_missing(df):
    """Return the percentage of missing cells in ``df``, rounded to 2 decimals.

    The cell count comes from ``df.size`` rather than ``np.product(df.shape)``:
    ``np.product`` was deprecated in NumPy 1.25 and removed in NumPy 2.0, so
    the previous implementation fails on current NumPy releases.

    Args:
        df: A pandas DataFrame (a Series also works).

    Returns:
        A float in ``[0, 100]``: missing cells divided by total cells, times
        100, rounded to two decimal places. An object with no cells yields
        ``0.0`` instead of a division-by-zero NaN.
    """
    total_cells = df.size
    if total_cells == 0:
        # Nothing can be missing when there are no cells at all.
        return 0.0

    # The first sum() counts nulls per column; the second collapses to one total.
    total_missing = int(df.isnull().sum().sum())
    return round((total_missing / total_cells) * 100, 2)

In [9]:
# Report the overall share of missing cells across the whole frame.
print("The Telecom dataset contains", percent_missing(xDR_df), "%", "missing values.")

The Telecom dataset contains 12.5 % missing values.
