In [22]:
import pandas as pd
import numpy as np
from markdown import MarkDownSeries

### Reading in our data

In [23]:
RawData = pd.read_csv('vgchartz-2024.csv')

### Using our function 

#### **How we will do it**



1. **Importing the Function**:
    * ```from data_preprocessing import preprocess_data```

        * This line imports the `preprocess_data` function from our file named `data_preprocessing.py`. 
        
        * By doing this, we can use the `preprocess_data` function in our current script or notebook without having to redefine it.



2. **Calling the Function**:

    * `preprocessed_RawData = preprocess_data(RawData)`

        * This line calls the `preprocess_data` function, passing `RawData` (which is our DataFrame) as an argument.

        * The `preprocess_data` function processes `RawData` according to the steps defined within the function, such as converting dates, dropping unnecessary columns, handling missing values, and converting strings to lowercase.

        * The processed DataFrame is then assigned to a new variable, `preprocessed_RawData`, which we can use for further analysis.


In [24]:
from data_preprocessing import preprocess_data

preprocessed_RawData = preprocess_data(RawData)

In [25]:
def cleaning(df):
    categories = {
        'nintendo': ['3ds', 'dsiw', 'dsi', 'ds', 'wii', 'wiiu', 'ns', 'gb', 'gba', 'nes', 'snes', 'gbc', 'n64', 'vb', 'gc', 'vc','ww'],
        'pc': ['linux', 'osx', 'pc', 'arc', 'all', 'fmt', 'c128', 'aco'],
        'xbox': ['x360', 'xone', 'series', 'xbl', 'xb', 'xs'],
        'sony': ['ps', 'ps2', 'ps3', 'ps4', 'ps5', 'psp', 'psv', 'psn', 'cdi'],
        'mobile': ['ios', 'and', 'winp', 'ngage', 'mob'],
        'sega': ['gg', 'msd', 'ms', 'gen', 'scd', 'sat', 's32x', 'dc'],
        'atari': ['2600', '7800', '5200', 'aj', 'int'],
        'commodore': ['amig', 'c64', 'cd32'],
        'other': ['ouya', 'or', 'acpc', 'ast', 'apii', 'pce', 'zxs', 'lynx', 'ng', 'zxs', '3do', 'pcfx', 'ws', 'brw', 'cv', 'giz', 'msx', 'tg16', 'bbcm']
    }

    # Step 2: Flatten categories into a single list
    all_items = []
    for sublist in categories.values():
        for item in sublist:
            all_items.append(item)

    # Step 3: Check for missing items
    all_items_lower = [item.lower().strip() for item in all_items]
    unique_values_lower = set(preprocessed_RawData['console'].str.lower().str.strip().unique())
    missing_items = set(all_items_lower) - unique_values_lower

    if missing_items:
        print(f"Missing items: {missing_items}")
    else:
        print("All items are covered.")

    # Step 4: Create conditions and values for np.select
    conditions = []
    for items in categories.values():
        conditions.append(preprocessed_RawData['console'].isin(items))

    values = list(categories.keys())

    # Step 5: Assign console manufacturers
    preprocessed_RawData['console_mfg'] = np.select(conditions, values, default='unknown')

    return df

In [28]:
cleaned_data = cleaning(preprocessed_RawData)
cleaned_data.head()

All items are covered.


Unnamed: 0,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,console_mfg
0,Grand Theft Auto V,ps3,Action,Rockstar Games,Rockstar North,9.4,20.32,6.37,0.99,9.85,3.12,2013-09-17,sony
1,Grand Theft Auto V,ps4,Action,Rockstar Games,Rockstar North,9.7,19.39,6.06,0.6,9.71,3.02,2014-11-18,sony
2,Grand Theft Auto: Vice City,ps2,Action,Rockstar Games,Rockstar North,9.6,16.15,8.41,0.47,5.49,1.78,2002-10-28,sony
3,Grand Theft Auto V,x360,Action,Rockstar Games,Rockstar North,0.0,15.86,9.06,0.06,5.33,1.42,2013-09-17,xbox
4,Call of Duty: Black Ops 3,ps4,Shooter,Activision,Treyarch,8.1,15.09,6.18,0.41,6.05,2.44,2015-11-06,sony


In [26]:
# RawData['console_mfg'].value_counts()

In [27]:
missing_consoles = preprocessed_RawData[preprocessed_RawData['console_mfg'] == "unknown"]['console']
print("Consoles with unknown manufacturers:")
missing_consoles.value_counts()

KeyError: 'console_mfg'

In [None]:
ConsoleToQuery = 'pc'
QueryResult = preprocessed_RawData[preprocessed_RawData['console'] == ConsoleToQuery]
print(f"Rows where console = '{ConsoleToQuery}':")
QueryResult.head(20)

Rows where console = 'pc':


Unnamed: 0,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,console_mfg
34,The Sims 3,pc,Simulation,Electronic Arts,EA Redwood Shores,8.5,7.96,1.01,0.0,6.46,0.5,2009-06-02,pc
79,Microsoft Flight Simulator,pc,Simulation,Microsoft,Microsoft,7.0,5.12,3.22,0.0,1.69,0.2,1996-10-31,pc
144,The Sims 4,pc,Simulation,Electronic Arts,Maxis,7.0,4.1,1.2,0.0,2.6,0.3,2014-09-02,pc
154,The Elder Scrolls V: Skyrim,pc,Role-Playing,Bethesda Softworks,Bethesda Game Studios,9.2,3.99,1.18,0.0,2.25,0.56,2011-11-11,pc
166,The Sims: Unleashed,pc,Simulation,Electronic Arts,Maxis,7.3,3.76,2.03,0.0,1.56,0.17,2002-09-23,pc
181,Doom II,pc,Shooter,GT Interactive,id Software,9.3,3.61,2.05,0.0,1.4,0.16,1994-09-30,pc
253,The Sims: Vacation,pc,Simulation,Electronic Arts,Maxis,0.0,3.07,1.72,0.0,1.21,0.14,2002-03-25,pc
262,The Sims: Livin Large,pc,Misc,Electronic Arts,Maxis,0.0,2.99,1.67,0.0,1.18,0.13,2000-08-27,pc
304,Battlefield 3,pc,Shooter,Electronic Arts,Dice,8.9,2.76,0.89,0.0,1.44,0.43,2011-10-25,pc
388,Theme Hospital,pc,Strategy,Electronic Arts,Bullfrog Productions,5.5,2.4,2.3,0.0,0.1,0.0,1997-03-31,pc


In [None]:
GameToQuery = 'The Great Giana Sisters'
GameQueryResult = preprocessed_RawData[preprocessed_RawData['title'] == GameToQuery]
GameQueryResult.head(20)

Unnamed: 0,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,console_mfg
38407,The Great Giana Sisters,msx,Misc,Rainbow Arts,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,1987-01-01,other
38408,The Great Giana Sisters,ast,Misc,Rainbow Arts,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,1987-01-01,other
38409,The Great Giana Sisters,amig,Misc,Rainbow Arts,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,1987-01-01,commodore
38410,The Great Giana Sisters,c64,Misc,Rainbow Arts,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,1987-01-01,commodore
38411,The Great Giana Sisters,brw,Misc,Rainbow Arts,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,1987-01-01,other
41938,The Great Giana Sisters,acpc,Platform,Rainbow Arts,Time Warp Productions,0.0,0.0,0.0,0.0,0.0,0.0,1987-01-01,other


In [None]:
categoriesList = []
for manufacturer, consoles in categories.items():
    for console in consoles:
        categoriesList.append({'manufacturer': manufacturer, 'console': console})

# Converting the list to a DataFrame
mfg_list = pd.DataFrame(categoriesList)

# Grouping by 'manufacturer' and aggregating consoles into lists
grouped_series = mfg_list.groupby('manufacturer')['console'].apply(list)

# Converting the grouped Series to Markdown
markdown_table = MarkDownSeries(grouped_series)

print(markdown_table)

| manufacturer | console |
|---|---|
| atari | 2600, 7800, 5200, aj, int |
| commodore | amig, c64, cd32 |
| mobile | ios, and, winp, ngage, mob |
| nintendo | 3ds, dsiw, dsi, ds, wii, wiiu, ns, gb, gba, nes, snes, gbc, n64, vb, gc, vc, ww |
| other | ouya, or, acpc, ast, apii, pce, zxs, lynx, ng, zxs, 3do, pcfx, ws, brw, cv, giz, msx, tg16, bbcm |
| pc | linux, osx, pc, arc, all, fmt, c128, aco |
| sega | gg, msd, ms, gen, scd, sat, s32x, dc |
| sony | ps, ps2, ps3, ps4, ps5, psp, psv, psn, cdi |
| xbox | x360, xone, series, xbl, xb, xs |



| manufacturer | console |
|---|---|
| atari | 2600, 7800, 5200, aj, int |
| commodore | amig, c64, cd32 |
| mobile | ios, and, winp, ngage, mob |
| nintendo | 3ds, dsiw, dsi, ds, wii, wiiu, ns, gb, gba, nes, snes, gbc, n64, vb, gc, vc, ww |
| other | ouya, or, acpc, ast, apii, pce, zxs, lynx, ng, zxs, 3do, pcfx, ws, brw, cv, giz, msx, tg16, bbcm |
| pc | linux, osx, pc, arc, all, fmt, c128, aco |
| sega | gg, msd, ms, gen, scd, sat, s32x, dc |
| sony | ps, ps2, ps3, ps4, ps5, psp, psv, psn, cdi |
| xbox | x360, xone, series, xbl, xb, xs |
