In [160]:
import pandas as pd
import numpy as np
import string
import re
from IPython.core.interactiveshell import InteractiveShell
# Notebook display setting: echo the value of every top-level expression in a
# cell, not only the last one (IPython's default is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

In [163]:
#Replace dirty values in the column with nan
def replace_dirtydata_nan(data_col, nan_items):
    """Return a copy of ``data_col`` with every placeholder value turned into NaN.

    Parameters
    ----------
    data_col : pandas.Series
        Column to clean. It is not modified.
    nan_items : iterable
        Exact values (e.g. "TBD") that should be treated as missing.

    Returns
    -------
    pandas.Series
        Same index as ``data_col``; entries equal to any item in ``nan_items``
        are NaN, everything else is unchanged.
    """
    # Map each placeholder to NaN and let pandas apply all substitutions at once.
    to_missing = {placeholder: np.nan for placeholder in nan_items}
    return data_col.replace(to_missing)

#make the column values boolean (yes/no)
def rep_with_boolean(data_col):
    """Collapse a column to "Yes" / "No" while leaving missing values missing.

    Entries that are exactly "No" or are NaN are kept as they are; every other
    entry (any free-text description of the feature) becomes "Yes".

    Parameters
    ----------
    data_col : pandas.Series
        Column to normalise. It is not modified.

    Returns
    -------
    pandas.Series
        Series with the same index and name as ``data_col``.
    """
    keep_as_is = data_col.eq("No") | data_col.isna()
    # Series.where keeps the original index and name. Rebuilding the column via
    # pd.Series(np.where(...)) reset it to a fresh RangeIndex, which misaligns
    # rows when the cleaned columns are later concatenated side by side.
    return data_col.where(keep_as_is, "Yes")

#extract first occurrence of numbers in the string
def extract_mAh(data):
    """Pull the first run of digits out of each string and return it as a float.

    Parameters
    ----------
    data : pandas.Series
        Series of strings (NaN entries are allowed).

    Returns
    -------
    pandas.Series
        Float series on the same index; NaN where the entry is missing or
        contains no digits. Only the integer digits are captured, so any
        fractional part after a decimal point is not included.
    """
    # One capture group -> one-column frame; its single column is labelled 0.
    captured = data.str.extract(r"(\b\d+)")
    first_number = captured.loc[:, 0]
    return first_number.astype(float)

#convert battery data in Wh to mAh to make the data consistent
def convert_Wh_to_mAh(data, voltage=3.7):
    """Convert battery capacities written in watt-hours to milliamp-hours.

    The first number in each string is read as a Wh figure and converted with
    ``mAh = Wh * 1000 / voltage``, rounded to the nearest whole number.

    Parameters
    ----------
    data : pandas.Series
        Series of strings such as "11.1 Wh" (NaN entries are allowed).
    voltage : float, optional
        Voltage used for the conversion. Defaults to 3.7, the value this
        function previously hard-coded (presumably the nominal Li-ion cell
        voltage; confirm for the dataset).

    Returns
    -------
    pandas.Series
        Float series on the same index; NaN where no number could be found.

    Raises
    ------
    ValueError
        If ``voltage`` is not strictly positive.
    """
    if voltage <= 0:
        raise ValueError("voltage must be a positive number")

    # Wh ratings are frequently fractional ("11.1 Wh"). An integer-only pattern
    # would truncate them before conversion, so capture an optional decimal part.
    watt_hours = data.str.extract(r"(\b\d+(?:\.\d+)?)")[0].astype(float)
    return (watt_hours * 1000 / voltage).round()

#MAIN DATA CLEANING FUNCTIONS

#AUDIO
def clean_audiojack_data(audio_jack):
    """Reduce each audio-jack entry to its first token.

    Each string is split on whitespace, a double quote or a pipe character and
    only the piece before the first separator is kept. A resulting "TBD" is
    treated as missing.

    Parameters
    ----------
    audio_jack : pandas.Series
        Raw audio-jack column (strings, NaN allowed).

    Returns
    -------
    pandas.Series
        Series on the same index holding the first token, or NaN.
    """
    pieces = audio_jack.str.split(r'[\s"|"]')
    leading_token = pieces.str.get(0)
    return leading_token.replace("TBD", np.nan)

#GPS
def clean_GPS_data(GPS):
    """Normalise the GPS column to "Yes" / "No" / NaN.

    The placeholders "To be confirmed" and "TBD" are first turned into NaN via
    ``replace_dirtydata_nan``; ``rep_with_boolean`` then keeps "No" and NaN and
    maps every other entry to "Yes".

    Parameters
    ----------
    GPS : pandas.Series
        Raw GPS column.

    Returns
    -------
    pandas.Series
        The cleaned column.
    """
    placeholders = ["To be confirmed", "TBD"]
    return rep_with_boolean(replace_dirtydata_nan(GPS, placeholders))

#NFC
def clean_NFC_data(NFC):
    """Normalise the NFC column to "Yes" / "No" / NaN.

    The placeholders "TBC", "TBD" and "To be confirmed" are first turned into
    NaN via ``replace_dirtydata_nan``; ``rep_with_boolean`` then keeps "No" and
    NaN and maps every other entry to "Yes".

    Parameters
    ----------
    NFC : pandas.Series
        Raw NFC column.

    Returns
    -------
    pandas.Series
        The cleaned column.
    """
    placeholders = ["TBC", "TBD", "To be confirmed"]
    return rep_with_boolean(replace_dirtydata_nan(NFC, placeholders))

#RADIO
def clean_radio_data(radio):
    """Normalise the radio column to "Yes" / "No" / NaN.

    The placeholders listed below are first turned into NaN via
    ``replace_dirtydata_nan``; ``rep_with_boolean`` then keeps "No" and NaN and
    maps every other entry to "Yes".

    Parameters
    ----------
    radio : pandas.Series
        Raw radio column.

    Returns
    -------
    pandas.Series
        The cleaned column.
    """
    # NOTE(review): "N&#1086" looks like "No" written with an HTML-encoded
    # Cyrillic "о" (code point 1086). It is currently treated as missing;
    # confirm whether it should be mapped to "No" instead.
    placeholders = ["TBC", "TBD", "To be confirmed", "N&#1086"]
    return rep_with_boolean(replace_dirtydata_nan(radio, placeholders))

#BATTERY
def clean_battery_data(battery):
    """Express every battery capacity in mAh.

    Entries containing "mAh" contribute their first number directly (via
    ``extract_mAh``). Every other entry is passed through
    ``convert_Wh_to_mAh``, i.e. it is assumed to be a watt-hour figure.

    Parameters
    ----------
    battery : pandas.Series
        Raw battery column (strings, NaN allowed).

    Returns
    -------
    pandas.Series
        Float series with the same index and name as ``battery``; NaN where the
        entry is missing or holds no number.
    """
    # na=False gives a clean boolean mask; missing entries fall through to the
    # Wh branch, which yields NaN for them as well.
    is_mah = battery.str.contains("mAh", na=False)
    # Series.where keeps the original index, unlike pd.Series(np.where(...)),
    # which reset it to 0..n-1 and could misalign rows in a later concat.
    capacity = extract_mAh(battery).where(is_mah, convert_Wh_to_mAh(battery))
    return capacity.rename(battery.name)



In [164]:
if __name__ == "__main__":
    # Load the raw phone table; the first row of the CSV holds the column names.
    data = pd.read_csv("phones_refined.csv", header=0)

    # Clean each column of interest independently.
    audio_jack = clean_audiojack_data(data["audio_jack"])
    GPS = clean_GPS_data(data["GPS"])
    NFC = clean_NFC_data(data["NFC"])
    radio = clean_radio_data(data["radio"])
    battery = clean_battery_data(data["battery"])

    # Assemble the cleaned columns side by side (dict keys become the column
    # labels) and write the result out.
    cleaned_columns = {
        "audio_jack": audio_jack,
        "GPS": GPS,
        "NFC": NFC,
        "radio": radio,
        "battery": battery,
    }
    cleaned_frame = pd.concat(cleaned_columns, axis=1)
    cleaned_frame.to_csv("alisha_refined_phonedata.csv")
