In [1]:
import pandas as pd
import numpy as np
from markdown import MarkDownSeries

### Reading in our data

In [2]:
RawData = pd.read_csv('vgchartz-2024.csv')

### Using our function 

#### **How we will do it**



1. **Importing the Function**:
    * ```from data_preprocessing import preprocess_data```

        * This line imports the `preprocess_data` function from our file named `data_preprocessing.py`. 
        
        * By doing this, we can use the `preprocess_data` function in our current script or notebook without having to redefine it.



2. **Calling the Function**:

    * `preprocessed_RawData = preprocess_data(RawData)`

        * This line calls the `preprocess_data` function, passing `RawData` (which is our DataFrame) as an argument.

        * The `preprocess_data` function processes `RawData` according to the steps defined within the function, such as converting dates, dropping unnecessary columns, handling missing values, and converting strings to lowercase.

        * The processed DataFrame is then assigned to a new variable, `preprocessed_RawData`, which we can use for further analysis.


In [3]:
from data_preprocessing import preprocess_data

preprocessed_RawData = preprocess_data(RawData)

### **Summary**

- We imported a preprocessing function from an external file to keep our code organized and clean.

- We applied the preprocessing function to our data and stored the result in a new variable for further use.

In [4]:
preprocessed_RawData.head(1)

Unnamed: 0,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date
0,Grand Theft Auto V,ps3,Action,Rockstar Games,Rockstar North,9.4,20.32,6.37,0.99,9.85,3.12,2013-09-17


### **Problem:**

We have the console names for the sales data, but we lack the manufacturer information. By conducting some research, we can determine the manufacturer for each console. To address this, we will create a dictionary to map each console to its respective manufacturer.


### **Mapping Console to Create Manufacturer Column:**

1. **Create a Dictionary for Mapping**:

    * Define a dictionary that maps each console to its respective manufacturer.


2. **Flatten into a Single List**:

    * Ensure that the dictionary keys are properly flattened into a single list for easy access and manipulation.


3. **Check for Missing Items**:

    * Identify any consoles in the data that are not included in the dictionary to ensure completeness.


4. <strong>Create Conditions and Values for <code>np.select</code></strong>:

    * Define conditions and corresponding values to use with the <code>np.select</code> function for assigning manufacturers.


5. <strong>Assign Console Manufacturers</strong>:

    * Use the mapping to create a new column in the DataFrame that lists the manufacturer for each console.


This approach ensures our dataset accurately reflects the relationship between consoles and their manufacturers, providing a complete view for analysis.

---

### We will start by making our empty dictionary 

In [5]:
categories = {
    'nintendo': [],
    'pc': [],
    'xbox': [],
    'sony': [],
    'mobile': [],
    'sega': [],
    'atari': [],
    'commodore': [],
    'other': []
}

In [6]:
preprocessed_RawData['console'].value_counts().head(3)

console
pc     10477
ps2     3511
ds      3166
Name: count, dtype: int64

### The finished Dictionary should look like this:

| manufacturer | console |
|---|---|
| atari | 2600, 7800, 5200, aj, int |
| commodore | amig, c64, cd32 |
| mobile | ios, and, winp, ngage, mob |
| nintendo | 3ds, dsiw, dsi, ds, wii, wiiu, ns, gb, gba, nes, snes, gbc, n64, vb, gc, vc, ww |
| other | ouya, or, acpc, ast, apii, pce, zxs, lynx, ng, zxs, 3do, pcfx, ws, brw, cv, giz, msx, tg16, bbcm |
| pc | linux, osx, pc, arc, all, fmt, c128, aco |
| sega | gg, msd, ms, gen, scd, sat, s32x, dc |
| sony | ps, ps2, ps3, ps4, ps5, psp, psv, psn, cdi |
| xbox | x360, xone, series, xbl, xb, xs |


In [7]:
# categories = {
#     'nintendo': ['3ds', 'dsiw', 'dsi', 'ds', 'wii', 'wiiu', 'ns', 'gb', 'gba', 'nes', 'snes', 'gbc', 'n64', 'vb', 'gc', 'vc','ww'],
#     'pc': ['linux', 'osx', 'pc', 'arc', 'all', 'fmt', 'c128', 'aco'],
#     'xbox': ['x360', 'xone', 'series', 'xbl', 'xb', 'xs'],
#     'sony': ['ps', 'ps2', 'ps3', 'ps4', 'ps5', 'psp', 'psv', 'psn', 'cdi'],
#     'mobile': ['ios', 'and', 'winp', 'ngage', 'mob'],
#     'sega': ['gg', 'msd', 'ms', 'gen', 'scd', 'sat', 's32x', 'dc'],
#     'atari': ['2600', '7800', '5200', 'aj', 'int'],
#     'commodore': ['amig', 'c64', 'cd32'],
#     'other': ['ouya', 'or', 'acpc', 'ast', 'apii', 'pce', 'zxs', 'lynx', 'ng', 'zxs', '3do', 'pcfx', 'ws', 'brw', 'cv', 'giz', 'msx', 'tg16', 'bbcm']
# }

# # Step 2: Flatten categories into a single list
# all_items = []
# for sublist in categories.values():
#     for item in sublist:
#         all_items.append(item)

# # Step 3: Check for missing items
# all_items_lower = [item.lower().strip() for item in all_items]
# unique_values_lower = set(RawData['console'].str.lower().str.strip().unique())
# missing_items = set(all_items_lower) - unique_values_lower

# if missing_items:
#     print(f"Missing items: {missing_items}")
# else:
#     print("All items are covered.")

# # Step 4: Create conditions and values for np.select
# conditions = []
# for items in categories.values():
#     conditions.append(RawData['console'].isin(items))

# values = list(categories.keys())

# # Step 5: Assign console manufacturers
# RawData['console_mfg'] = np.select(conditions, values, default='unknown')

In [8]:
# RawData['console_mfg'].value_counts()

In [9]:
missing_consoles = RawData[RawData['console_mfg'] == "unknown"]['console']
print("Consoles with unknown manufacturers:")
missing_consoles.value_counts()

KeyError: 'console_mfg'

In [None]:
ConsoleToQuery = 'pc'
QueryResult = RawData[RawData['console'] == ConsoleToQuery]
print(f"Rows where console = '{ConsoleToQuery}':")
QueryResult.head(20)

Rows where console = 'pc':


Unnamed: 0,img,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,last_update


In [None]:
GameToQuery = 'The Great Giana Sisters'
GameQueryResult = RawData[RawData['title'] == GameToQuery]
GameQueryResult.head(20)

Unnamed: 0,img,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,last_update,console_mfg
38407,/games/boxart/default.jpg,The Great Giana Sisters,MSX,Misc,Rainbow Arts,Unknown,,,,,,,1987-01-01,,unknown
38408,/games/boxart/default.jpg,The Great Giana Sisters,AST,Misc,Rainbow Arts,Unknown,,,,,,,1987-01-01,,unknown
38409,/games/boxart/default.jpg,The Great Giana Sisters,Amig,Misc,Rainbow Arts,Unknown,,,,,,,1987-01-01,,unknown
38410,/games/boxart/default.jpg,The Great Giana Sisters,C64,Misc,Rainbow Arts,Unknown,,,,,,,1987-01-01,,unknown
38411,/games/boxart/default.jpg,The Great Giana Sisters,BRW,Misc,Rainbow Arts,Unknown,,,,,,,1987-01-01,,unknown
41938,/games/boxart/full_1949814AmericaFrontccc.png,The Great Giana Sisters,ACPC,Platform,Rainbow Arts,Time Warp Productions,,,,,,,1987-01-01,2018-01-06,unknown


In [None]:
categoriesList = []
for manufacturer, consoles in categories.items():
    for console in consoles:
        categoriesList.append({'manufacturer': manufacturer, 'console': console})

# Converting the list to a DataFrame
mfg_list = pd.DataFrame(categoriesList)

# Grouping by 'manufacturer' and aggregating consoles into lists
grouped_series = mfg_list.groupby('manufacturer')['console'].apply(list)

# Converting the grouped Series to Markdown
markdown_table = MarkDownSeries(grouped_series)

print(markdown_table)

| manufacturer | console |
|---|---|
| atari | 2600, 7800, 5200, aj, int |
| commodore | amig, c64, cd32 |
| mobile | ios, and, winp, ngage, mob |
| nintendo | 3ds, dsiw, dsi, ds, wii, wiiu, ns, gb, gba, nes, snes, gbc, n64, vb, gc, vc, ww |
| other | ouya, or, acpc, ast, apii, pce, zxs, lynx, ng, zxs, 3do, pcfx, ws, brw, cv, giz, msx, tg16, bbcm |
| pc | linux, osx, pc, arc, all, fmt, c128, aco |
| sega | gg, msd, ms, gen, scd, sat, s32x, dc |
| sony | ps, ps2, ps3, ps4, ps5, psp, psv, psn, cdi |
| xbox | x360, xone, series, xbl, xb, xs |



| manufacturer | console |
|---|---|
| atari | 2600, 7800, 5200, aj, int |
| commodore | amig, c64, cd32 |
| mobile | ios, and, winp, ngage, mob |
| nintendo | 3ds, dsiw, dsi, ds, wii, wiiu, ns, gb, gba, nes, snes, gbc, n64, vb, gc, vc, ww |
| other | ouya, or, acpc, ast, apii, pce, zxs, lynx, ng, zxs, 3do, pcfx, ws, brw, cv, giz, msx, tg16, bbcm |
| pc | linux, osx, pc, arc, all, fmt, c128, aco |
| sega | gg, msd, ms, gen, scd, sat, s32x, dc |
| sony | ps, ps2, ps3, ps4, ps5, psp, psv, psn, cdi |
| xbox | x360, xone, series, xbl, xb, xs |
