 # Extract - Transform - Load 
 This notebook contains the ETL for three raw datasets in .json.gzip format. Due to the large volume of the datasets, this process aims to clear the data and make it understandable, and to choose the relevant variables for the endpoints functions and ML  recommendation model. 

In [1]:
#import libraries to use
import pandas as pd
import json
import gzip
import ast

import pyarrow as pa
import pyarrow.parquet as pq

import warnings
warnings.filterwarnings("ignore")

# steam_games.json.gz

### Extract



In [2]:
# This is a gzipped JSON file with a JSON array of objects
# Open file with UTF-8 coding using gzip.
with gzip.open('../data/raw/steam_games.json.gz', 'rt', encoding='utf-8') as steam_games:
    
    # Read JSON file line by line and create a DataFrame.
    df_games = pd.read_json(steam_games, lines=True)

In [3]:
df_games

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,0.0,773640.0,"Nikita ""Ghost_RUS"""
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,0.0,733530.0,Sacada
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,0.0,610660.0,Laush Dmitriy Sergeevich
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,0.0,658870.0,"xropi,stev3ns"


### Transform

#### drop nulls and unneeded columns

In [4]:
#drop rows with all null values
df_games = df_games.dropna(how='all').reset_index(drop=True)
df_games.shape

(32135, 13)

In [5]:
# Count nulls
df_games.isnull().sum()

publisher       8052
genres          3283
app_name           2
title           2050
url                0
release_date    2067
tags             163
reviews_url        2
specs            670
price           1377
early_access       0
id                 2
developer       3299
dtype: int64

In [6]:
# Drop columns
df_games.drop(columns=['publisher','specs','early_access', 'reviews_url','app_name', 'url', 'tags' ], axis=1, inplace=True)

In [7]:
# Drop nulls in all remaining columns
df_games.dropna(inplace=True)

In [8]:
# Count nulls again
df_games.isnull().sum()

genres          0
title           0
release_date    0
price           0
id              0
developer       0
dtype: int64

#### drop duplicates

In [9]:
# This ensures that all sequences in the DataFrame are immutable, since tuples are immutable while lists are not

for col in df_games.columns:
    df_games[col] = df_games[col].apply(lambda x: tuple(x) if isinstance(x, list) else x)

In [10]:
#count duplicates
num_duplicates = df_games.duplicated().sum()
print(f"Total number of duplicate rows: {num_duplicates}")


Total number of duplicate rows: 1


In [11]:
index_list = [14573]
df_games = df_games.drop(index_list)

In [12]:
#count duplicates
df_games.duplicated().sum()

0

#### transform column 'genre'

In [13]:
df_games['genres'][1]

('Free to Play', 'Indie', 'RPG', 'Strategy')

In [14]:
# Explode column 'genres'
df_games = df_games.explode('genres')
df_games = df_games.dropna(subset=['genres'])
df_games.head()

Unnamed: 0,genres,title,release_date,price,id,developer
0,Action,Lost Summoner Kitty,2018-01-04,4.99,761140.0,Kotoshiro
0,Casual,Lost Summoner Kitty,2018-01-04,4.99,761140.0,Kotoshiro
0,Indie,Lost Summoner Kitty,2018-01-04,4.99,761140.0,Kotoshiro
0,Simulation,Lost Summoner Kitty,2018-01-04,4.99,761140.0,Kotoshiro
0,Strategy,Lost Summoner Kitty,2018-01-04,4.99,761140.0,Kotoshiro


In [15]:
df_games['genres'].unique()

array(['Action', 'Casual', 'Indie', 'Simulation', 'Strategy',
       'Free to Play', 'RPG', 'Sports', 'Adventure', 'Racing',
       'Early Access', 'Massively Multiplayer',
       'Animation &amp; Modeling', 'Web Publishing', 'Education',
       'Software Training', 'Utilities', 'Design &amp; Illustration',
       'Audio Production', 'Video Production', 'Photo Editing',
       'Accounting'], dtype=object)

In [16]:
df_games['genres'][100]

'Strategy'

#### transform column price

In [17]:
# list unique values in column 'price'
df_games['price'].unique()

array([4.99, 'Free To Play', 'Free to Play', 0.99, 3.99, 9.99, 18.99,
       29.99, 10.99, 2.99, 1.5899999999999999, 14.99, 1.99, 59.99, 8.99,
       6.99, 7.99, 39.99, 'Free', 19.99, 7.49, 12.99, 5.99, 2.49, 15.99,
       1.25, 24.99, 17.99, 61.99, 3.49, 11.99, 13.99, 'Free Demo',
       'Play for Free!', 34.99, 1.49, 32.99, 99.99, 14.95, 69.99, 16.99,
       79.99, 49.99, 5.0, 13.98, 29.96, 109.99, 149.99, 771.71,
       'Install Now', 21.99, 89.99, 'Play WARMACHINE: Tactics Demo', 0.98,
       139.92, 4.29, 'Free Mod', 54.99, 64.99, 74.99, 'Install Theme',
       0.89, 'Third-party', 0.5, 'Play Now', 299.99, 1.29, 119.99, 44.99,
       3.0, 15.0, 5.49, 23.99, 49.0, 10.93, 1.3900000000000001,
       'Free HITMAN™ Holiday Pack', 36.99, 4.49, 2.0, 4.0,
       1.9500000000000002, 1.5, 199.0, 189.0, 6.66, 27.99, 129.99, 179.0,
       26.99, 399.99, 31.99, 399.0, 20.0, 40.0, 3.33, 22.99, 320.0, 38.85,
       71.7, 995.0, 27.49, 3.39, 6.0, 19.95, 20.99, 499.99, 199.99, 16.06,
       4.68, 

In [18]:
# Normalize and change data type to numeric in column 'price'
df_games['price'] = pd.to_numeric(df_games['price'], errors='coerce').astype('float64').round(2)
df_games = df_games.fillna({'price': 0})

In [19]:
df_games['price'].unique()

array([4.9900e+00, 0.0000e+00, 9.9000e-01, 3.9900e+00, 9.9900e+00,
       1.8990e+01, 2.9990e+01, 1.0990e+01, 2.9900e+00, 1.5900e+00,
       1.4990e+01, 1.9900e+00, 5.9990e+01, 8.9900e+00, 6.9900e+00,
       7.9900e+00, 3.9990e+01, 1.9990e+01, 7.4900e+00, 1.2990e+01,
       5.9900e+00, 2.4900e+00, 1.5990e+01, 1.2500e+00, 2.4990e+01,
       1.7990e+01, 6.1990e+01, 3.4900e+00, 1.1990e+01, 1.3990e+01,
       3.4990e+01, 1.4900e+00, 3.2990e+01, 9.9990e+01, 1.4950e+01,
       6.9990e+01, 1.6990e+01, 7.9990e+01, 4.9990e+01, 5.0000e+00,
       1.3980e+01, 2.9960e+01, 1.0999e+02, 1.4999e+02, 7.7171e+02,
       2.1990e+01, 8.9990e+01, 9.8000e-01, 1.3992e+02, 4.2900e+00,
       5.4990e+01, 6.4990e+01, 7.4990e+01, 8.9000e-01, 5.0000e-01,
       2.9999e+02, 1.2900e+00, 1.1999e+02, 4.4990e+01, 3.0000e+00,
       1.5000e+01, 5.4900e+00, 2.3990e+01, 4.9000e+01, 1.0930e+01,
       1.3900e+00, 3.6990e+01, 4.4900e+00, 2.0000e+00, 4.0000e+00,
       1.9500e+00, 1.5000e+00, 1.9900e+02, 1.8900e+02, 6.6600e

#### transform column 'release_date'

In [20]:
# our endpoint functions require only the year of release, so we drop month and day
df_games['release_date'] = pd.to_datetime(df_games['release_date'], errors='coerce')
df_games['release_date'] = df_games['release_date'].dt.year
df_games.dropna(subset=['release_date'], inplace=True)
# Rename column 'release_date' to 'release_year'
df_games = df_games.rename(columns={'release_date': 'release_year'})
df_games = df_games.astype({'release_year': 'int'})

In [21]:
#reset index
df_games.reset_index(inplace=True)
df_games.drop('index',axis=1,inplace= True)

### Load

In [22]:
#show resulting dataset
df_games

Unnamed: 0,genres,title,release_year,price,id,developer
0,Action,Lost Summoner Kitty,2018,4.99,761140.0,Kotoshiro
1,Casual,Lost Summoner Kitty,2018,4.99,761140.0,Kotoshiro
2,Indie,Lost Summoner Kitty,2018,4.99,761140.0,Kotoshiro
3,Simulation,Lost Summoner Kitty,2018,4.99,761140.0,Kotoshiro
4,Strategy,Lost Summoner Kitty,2018,4.99,761140.0,Kotoshiro
...,...,...,...,...,...,...
67926,Indie,Russian Roads,2018,1.99,610660.0,Laush Dmitriy Sergeevich
67927,Racing,Russian Roads,2018,1.99,610660.0,Laush Dmitriy Sergeevich
67928,Simulation,Russian Roads,2018,1.99,610660.0,Laush Dmitriy Sergeevich
67929,Casual,EXIT 2 - Directions,2017,4.99,658870.0,"xropi,stev3ns"


In [23]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67931 entries, 0 to 67930
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   genres        67931 non-null  object 
 1   title         67931 non-null  object 
 2   release_year  67931 non-null  int64  
 3   price         67931 non-null  float64
 4   id            67931 non-null  float64
 5   developer     67931 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 3.1+ MB


In [24]:
#load a new processed dataset for EDA
df_games.to_csv('../data/processed/steam_games_processed.csv',  index=False)

In [26]:
pq.write_table(pa.Table.from_pandas(df_games), '../data/df_games.parquet')

# user_reviews.json.gz

### Extract

In [28]:
# This  is a gzipped JSON file with one JSON object per line.
# The keys in the objects are assumed to be strings and can contain any characters, including special characters like commas or colons.
def load_json_data(file):
    data = []

    with gzip.open(file, 'rt', encoding='utf-8') as f:
        for i in f:
            data.append(ast.literal_eval(i))

    df  = pd.DataFrame(data)
    
    return df
df_reviews = load_json_data('../data/raw/user_reviews.json.gz')

In [29]:
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


In [30]:

# explode column reviews 
df_reviews_exploded = df_reviews.explode('reviews')

# Normalize data
df_reviews_desanidado = pd.json_normalize(df_reviews_exploded['reviews'].dropna())

# Reset indexes
df_reviews_desanidado.reset_index(inplace=True)
df_reviews_exploded.reset_index(inplace=True)

# Concatenate dataframes
user_reviews = pd.concat([df_reviews_exploded, df_reviews_desanidado], axis=1)
df_reviews = user_reviews.drop(columns=['reviews'])

# show new dataframe
df_reviews

Unnamed: 0,index,user_id,user_url,index.1,funny,posted,last_edited,item_id,helpful,recommend,review
0,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0.0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1.0,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2.0,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,1,js41637,http://steamcommunity.com/id/js41637,3.0,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,1,js41637,http://steamcommunity.com/id/js41637,4.0,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...,...,...
59328,25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,,,,,,
59329,25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,,,,,,
59330,25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,,,,,,,
59331,25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,,,,,,,


### Transform

In [31]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   index        59333 non-null  int64  
 1   user_id      59333 non-null  object 
 2   user_url     59333 non-null  object 
 3   index        59305 non-null  float64
 4   funny        59305 non-null  object 
 5   posted       59305 non-null  object 
 6   last_edited  59305 non-null  object 
 7   item_id      59305 non-null  object 
 8   helpful      59305 non-null  object 
 9   recommend    59305 non-null  object 
 10  review       59305 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 5.0+ MB


#### drop nulls and unneeded columns

In [32]:
df_reviews.drop(columns=['index','last_edited','funny','helpful', 'user_url'], axis=1, inplace=True)

In [33]:
df_reviews.isnull().sum()

user_id       0
posted       28
item_id      28
recommend    28
review       28
dtype: int64

In [34]:
df_reviews.dropna(inplace=True)

In [35]:
df_reviews.isnull().sum()

user_id      0
posted       0
item_id      0
recommend    0
review       0
dtype: int64

#### drop duplicates

In [36]:
df_reviews.duplicated().sum()

144

In [37]:
df_reviews.drop_duplicates(inplace=True)

In [38]:
df_reviews.duplicated().sum()

0

In [39]:
df_reviews


Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.
2,76561197970982479,"Posted April 21, 2011.",43110,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,"Posted June 24, 2014.",251610,True,I know what you think when you see this title ...
4,js41637,"Posted September 8, 2013.",227300,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...
59300,Fuckfhaisjnsnsjakaka,Posted July 10.,70,True,a must have classic from steam definitely wort...
59301,3214213216,Posted July 8.,362890,True,this game is a perfect remake of the original ...
59302,ChrisCoroner,Posted July 3.,273110,True,had so much fun plaing this and collecting res...
59303,CaptainAmericaCw,Posted July 20.,730,True,:D


#### posted

In [40]:
def convertir_string_fecha(date_string):
    # Dividir la cadena en palabras
    words = date_string.split()
    
    if len(words) < 4:
        # Si no hay suficientes palabras en la cadena, devuelve una fecha nula
        return pd.NaT

    month = words[1]
    day = int(words[2].rstrip(','))
    year = int(words[3].rstrip('.'))
    
    date_format = f"{year}, {month}, {day}"
    
    try:
        return pd.to_datetime(date_format, format='%Y, %B, %d')
    except ValueError:
        # Si la conversión falla, devuelve una fecha nula
        return pd.NaT

# Aplicar la función a la columna completa
df_reviews['posted'] = df_reviews['posted'].apply(convertir_string_fecha)

In [41]:
df_reviews.isnull().sum()

user_id          0
posted       10091
item_id          0
recommend        0
review           0
dtype: int64

In [42]:
df_reviews.dropna(subset=['posted'], inplace=True)

In [43]:
df_reviews

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,2011-11-05,1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,2011-07-15,22200,True,It's unique and worth a playthrough.
2,76561197970982479,2011-04-21,43110,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,2014-06-24,251610,True,I know what you think when you see this title ...
4,js41637,2013-09-08,227300,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...
59252,llDracuwulf,2015-10-14,730,True,its FUNNNNNNNN
59255,76561198223837952,2015-10-10,253980,True,Awesome fantasy game if you don't mind the gra...
59265,76561198229845636,2015-10-31,730,True,Prettyy Mad Game
59267,76561198232478272,2015-12-14,730,True,AMAZING GAME 10/10


In [44]:
df_reviews = df_reviews.astype({'posted': 'datetime64[ns]'})

#### recommend

In [45]:
df_reviews = df_reviews.astype({'recommend': 'bool'})

### Load

In [46]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49070 entries, 0 to 59276
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   user_id    49070 non-null  object        
 1   posted     49070 non-null  datetime64[ns]
 2   item_id    49070 non-null  object        
 3   recommend  49070 non-null  bool          
 4   review     49070 non-null  object        
dtypes: bool(1), datetime64[ns](1), object(3)
memory usage: 1.9+ MB


In [47]:
#load a new processed dataset for EDA
df_reviews.to_csv('../data/processed/user_reviews_processed.csv',  index=False)

In [None]:
pq.write_table(pa.Table.from_pandas(df_reviews), '../data/df_reviews.parquet')

# users_items.json.gz

### Extract


In [49]:
# This  is a gzipped JSON file with one JSON object per line.
# The keys in the objects are assumed to be strings and can contain any characters, including special characters like commas or colons.
data_items = []
for i in gzip.open('../data/raw/users_items.json.gz'):
    data_items.append(ast.literal_eval(i.decode('utf-8')))
df_items = pd.DataFrame(data_items)

In [50]:
df_items.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [51]:
# check data content in 'items'
df_items['items'][0]

[{'item_id': '10',
  'item_name': 'Counter-Strike',
  'playtime_forever': 6,
  'playtime_2weeks': 0},
 {'item_id': '20',
  'item_name': 'Team Fortress Classic',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '30',
  'item_name': 'Day of Defeat',
  'playtime_forever': 7,
  'playtime_2weeks': 0},
 {'item_id': '40',
  'item_name': 'Deathmatch Classic',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '50',
  'item_name': 'Half-Life: Opposing Force',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '60',
  'item_name': 'Ricochet',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '70',
  'item_name': 'Half-Life',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '130',
  'item_name': 'Half-Life: Blue Shift',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '300',
  'item_name': 'Day of Defeat: Source',
  'playtime_forever': 4733,
  'playtime_2weeks': 0},
 {'item_id': '240',
  'item_name': 'Counter-Strike: S

In [52]:
# Parse nested data in  column 'items' into separate columns
df_items_exploded = df_items.explode('items')

# Normalize unnested data and bring it to a new DataFrame
df_items_desanidado = pd.json_normalize(df_items_exploded['items'])

# Reset indexes
df_items_exploded.reset_index(inplace=True)
df_items_desanidado.reset_index(inplace=True)

# Concatenate  the two DataFrames
user_items = pd.concat([df_items_exploded, df_items_desanidado], axis=1)
df_items = user_items.drop(columns=['items'])

# Show resulting dataframe
df_items

Unnamed: 0,index,user_id,items_count,steam_id,user_url,index.1,item_id,item_name,playtime_forever,playtime_2weeks
0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,10,Counter-Strike,6.0,0.0
1,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1,20,Team Fortress Classic,0.0,0.0
2,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2,30,Day of Defeat,7.0,0.0
3,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,3,40,Deathmatch Classic,0.0,0.0
4,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,4,50,Half-Life: Opposing Force,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
5170010,88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,5170010,373330,All Is Dust,0.0,0.0
5170011,88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,5170011,388490,One Way To Die: Steam Edition,3.0,3.0
5170012,88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,5170012,521570,You Have 10 Seconds 2,4.0,4.0
5170013,88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,5170013,519140,Minds Eyes,3.0,3.0


### Transform

In [53]:
df_items.head(10)

Unnamed: 0,index,user_id,items_count,steam_id,user_url,index.1,item_id,item_name,playtime_forever,playtime_2weeks
0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,10,Counter-Strike,6.0,0.0
1,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1,20,Team Fortress Classic,0.0,0.0
2,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2,30,Day of Defeat,7.0,0.0
3,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,3,40,Deathmatch Classic,0.0,0.0
4,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,4,50,Half-Life: Opposing Force,0.0,0.0
5,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,5,60,Ricochet,0.0,0.0
6,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,6,70,Half-Life,0.0,0.0
7,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,7,130,Half-Life: Blue Shift,0.0,0.0
8,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,8,300,Day of Defeat: Source,4733.0,0.0
9,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,9,240,Counter-Strike: Source,1853.0,0.0


In [54]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5170015 entries, 0 to 5170014
Data columns (total 10 columns):
 #   Column            Dtype  
---  ------            -----  
 0   index             int64  
 1   user_id           object 
 2   items_count       int64  
 3   steam_id          object 
 4   user_url          object 
 5   index             int64  
 6   item_id           object 
 7   item_name         object 
 8   playtime_forever  float64
 9   playtime_2weeks   float64
dtypes: float64(2), int64(3), object(5)
memory usage: 394.4+ MB


#### drop nulls and unnecesary columns

In [55]:
df_items.drop(columns=['index','index','user_url','steam_id','playtime_2weeks'],axis=1 ,inplace=True)

In [56]:
df_items.isnull().sum()

user_id                 0
items_count             0
item_id             16806
item_name           16806
playtime_forever    16806
dtype: int64

In [57]:
df_items.dropna(inplace=True)

In [58]:
df_items.isnull().sum()

user_id             0
items_count         0
item_id             0
item_name           0
playtime_forever    0
dtype: int64

In [59]:
df_items

Unnamed: 0,user_id,items_count,item_id,item_name,playtime_forever
0,76561197970982479,277,10,Counter-Strike,6.0
1,76561197970982479,277,20,Team Fortress Classic,0.0
2,76561197970982479,277,30,Day of Defeat,7.0
3,76561197970982479,277,40,Deathmatch Classic,0.0
4,76561197970982479,277,50,Half-Life: Opposing Force,0.0
...,...,...,...,...,...
5170009,76561198329548331,7,346330,BrainBread 2,0.0
5170010,76561198329548331,7,373330,All Is Dust,0.0
5170011,76561198329548331,7,388490,One Way To Die: Steam Edition,3.0
5170012,76561198329548331,7,521570,You Have 10 Seconds 2,4.0


In [60]:
df_items.duplicated().sum()

59117

In [61]:
df_items.drop_duplicates(keep= 'first',inplace=True)

In [62]:
df_items

Unnamed: 0,user_id,items_count,item_id,item_name,playtime_forever
0,76561197970982479,277,10,Counter-Strike,6.0
1,76561197970982479,277,20,Team Fortress Classic,0.0
2,76561197970982479,277,30,Day of Defeat,7.0
3,76561197970982479,277,40,Deathmatch Classic,0.0
4,76561197970982479,277,50,Half-Life: Opposing Force,0.0
...,...,...,...,...,...
5170009,76561198329548331,7,346330,BrainBread 2,0.0
5170010,76561198329548331,7,373330,All Is Dust,0.0
5170011,76561198329548331,7,388490,One Way To Die: Steam Edition,3.0
5170012,76561198329548331,7,521570,You Have 10 Seconds 2,4.0


### Load 

In [63]:
#load a new processed dataset for EDA
df_items.to_csv('../data/processed/users_items_processed.csv',  index=False, encoding='utf-8')

In [64]:
pq.write_table(pa.Table.from_pandas(df_items), '../data/df_items.parquet')