Reading CSV Files with Encodings

In [1]:
import pandas as pd

# Load the raw laptop listings, decoding the file with the Latin-1 codec,
# then print a per-column summary (non-null counts and dtypes).
laptops = pd.read_csv(
    "laptops.csv",
    encoding="Latin-1",
)
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
Manufacturer                1303 non-null object
Model Name                  1303 non-null object
Category                    1303 non-null object
Screen Size                 1303 non-null object
Screen                      1303 non-null object
CPU                         1303 non-null object
RAM                         1303 non-null object
 Storage                    1303 non-null object
GPU                         1303 non-null object
Operating System            1303 non-null object
Operating System Version    1133 non-null object
Weight                      1303 non-null object
Price (Euros)               1303 non-null object
dtypes: object(13)
memory usage: 132.5+ KB


Cleaning Column Names

In [2]:
# Remove leading/trailing whitespace from every column label, echo each
# cleaned label, and write the cleaned labels back onto the frame.
new_columns = [raw_label.strip() for raw_label in laptops.columns]

for stripped_label in new_columns:
    print("EUM: ", stripped_label)

laptops.columns = new_columns

########################################

def formatting(column):
    """Return a normalised version of a column label.

    The label is stripped of surrounding whitespace, the exact phrase
    "Operating System" is shortened to "os", spaces become underscores,
    parentheses are dropped, and the result is lower-cased.

    Note that the "Operating System" substitution happens before
    lower-casing, so it is case-sensitive.
    """
    cleaned = column.strip().replace("Operating System", "os")
    # Apply the remaining literal substitutions in a fixed order.
    for old, new in ((" ", "_"), ("(", ""), (")", "")):
        cleaned = cleaned.replace(old, new)
    return cleaned.lower()

# Run every current column label through formatting() and install the
# normalised labels on the frame.
t_columns = [formatting(label) for label in laptops.columns]
laptops.columns = t_columns

EUM:  Manufacturer
EUM:  Model Name
EUM:  Category
EUM:  Screen Size
EUM:  Screen
EUM:  CPU
EUM:  RAM
EUM:  Storage
EUM:  GPU
EUM:  Operating System
EUM:  Operating System Version
EUM:  Weight
EUM:  Price (Euros)


Cleaning String Columns to Numeric

In [3]:
# Collect the distinct raw values of the "ram" column for inspection.
unique_ram = pd.unique(laptops["ram"])

Removing Non-Digit Characters

In [4]:
# Drop the literal "GB" suffix from each value, then re-check which
# distinct values remain.
ram_without_unit = laptops["ram"].str.replace("GB", "")
laptops["ram"] = ram_without_unit
unique_ram = laptops["ram"].unique()

Converting Columns to Numeric Dtypes

In [5]:
# Cast the cleaned "ram" strings to integers and capture the resulting
# per-column dtypes.
ram_as_int = laptops["ram"].astype(int)
laptops["ram"] = ram_as_int
dtypes = laptops.dtypes

Renaming Columns

In [6]:
# Rename "ram" to "ram_gb" on the existing frame, then summarise the
# renamed column.
laptops.rename(columns={"ram": "ram_gb"}, inplace=True)
ram_gb_desc = laptops["ram_gb"].describe()

Extracting Values from Strings

In [7]:
# The first whitespace-separated token of the "gpu" / "cpu" strings is
# stored as the corresponding manufacturer column.
gpu_tokens = laptops["gpu"].str.split()
laptops["gpu_manufacturer"] = gpu_tokens.str[0]

cpu_tokens = laptops["cpu"].str.split()
laptops["cpu_manufacturer"] = cpu_tokens.str[0]

# Frequency of each CPU manufacturer value.
cpu_manufacturer_counts = laptops["cpu_manufacturer"].value_counts()

Correcting Bad Values

In [8]:
# Every listed label maps to itself, except "Mac OS", which is folded
# into "macOS" so both spellings end up under one label.
os_labels = ["Android", "Chrome OS", "Linux", "Mac OS", "No OS", "Windows", "macOS"]
mapping_dict = {label: label for label in os_labels}
mapping_dict["Mac OS"] = "macOS"

# Series.map with a dict turns any value that is not a key into NaN.
laptops["os"] = laptops["os"].map(mapping_dict)

Dropping Missing Values

In [9]:
# Two alternative views of the data: one without any row that contains a
# null, and one without any column that contains a null.
laptops_no_null_rows = laptops.dropna(axis="index")
laptops_no_null_cols = laptops.dropna(axis="columns")

Filling Missing Values

In [10]:
# Rows whose OS version is missing, captured before any filling happens.
missing_version = laptops["os_version"].isnull()

# Which operating systems the missing versions belong to, before filling.
value_counts_before = laptops.loc[missing_version, "os"].value_counts()

# Fill ONLY the rows whose version is actually missing. Selecting on the
# "os" value alone would also overwrite versions that are already present
# for macOS / No OS rows, which is not "filling missing values".
laptops.loc[missing_version & (laptops["os"] == "macOS"), "os_version"] = "X"
laptops.loc[missing_version & (laptops["os"] == "No OS"), "os_version"] = "Version Unknown"

# Same breakdown after filling, to show which nulls remain.
value_counts_after = laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()

Cleaning a String Column

In [11]:
# Strip the unit suffix ("kgs" first, then "kg", so the longer form is
# removed whole), convert the remainder to float, rename the column to
# carry the unit, and write the cleaned frame to disk without the index.
weight_text = laptops["weight"].str.replace("kgs", "").str.replace("kg", "")
laptops["weight"] = weight_text.astype(float)
laptops.rename(columns={"weight": "weight_kg"}, inplace=True)
laptops.to_csv("laptops_cleaned.csv", index=False)