<a href="https://colab.research.google.com/github/grigorjevas/Discogs-price-prediction/blob/main/Preparing%20data%20and%20models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modelling Discogs Marketplace price predictions

## EDA and selecting data

In [1]:
# Colab-only: mount Google Drive so files under /content/drive (where the
# dataset CSV is read from later in this notebook) become accessible.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression

import pickle

In [3]:
# Location of the raw dataset on the mounted Google Drive.
RAW_DATA_PATH = "/content/drive/MyDrive/Data/electro_raw_data.csv"

# Load the raw data: "release_date" is parsed as a datetime column and the
# literal string "N/A" is treated as a missing value.
df = pd.read_csv(
    RAW_DATA_PATH,
    parse_dates=["release_date"],
    na_values="N/A",
)

In [4]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,artist,title,label,release_format,number_of_tracks,release_date,price,rating,votes,have,want,limited_edition,media_condition,sleeve_condition,release_page
0,Quintron And Miss Pussycat,Live At Third Man Records,Third Man Records ‎– TMR 147,LP,10,2012-03-12,19.0,5.0,1.0,9,10,0,Mint (M),Mint (M),https://discogs.com/Quintron-And-Miss-Pussycat...
1,Jean-Michel Jarre,The Concerts In China,"PGP RTB ‎– 3220117, Polydor ‎– DLP 2612 039",LP,15,1982-01-01,21.27,4.0,23.0,204,53,0,Very Good Plus (VG+),Very Good (VG),https://discogs.com/Jean-Michel-Jarre-The-Conc...
2,MC T-Rock,Introducing,M-Pire Records ‎– MPR 9901.4,"12""",4,1999-01-01,21.34,4.42,24.0,107,78,0,Very Good Plus (VG+),Generic,https://discogs.com/MC-T-Rock-Introducing/rele...
3,E.T.M.S.,Sounds Of Humanoid Kind,Musix Records ‎– SMX 11004,"12""",13,1983-01-01,24.19,4.4,72.0,256,267,1,Very Good Plus (VG+),Very Good Plus (VG+),https://discogs.com/ETMS-Sounds-Of-Humanoid-Ki...
4,The Art Of Noise,In Visible Silence,"Chrysalis ‎– BFV 41528, China Records ‎– BFV 4...",LP,11,1986-01-01,24.14,4.01,109.0,896,123,0,Very Good Plus (VG+),Very Good Plus (VG+),https://discogs.com/The-Art-Of-Noise-In-Visibl...


In [5]:
# (rows, columns) of the raw frame.
df.shape

(4998, 15)

Convert release date to year

In [6]:
# Keep only the year of the release date as a feature. The preview further
# down shows it as a float (e.g. 2012.0) — presumably because some dates are
# missing; verify if an integer year is required.
df = df.assign(release_year=df["release_date"].dt.year)

Parse item condition to numerical values

In [7]:
# Numeric score for every condition label handled by this notebook.
# Built once at definition time instead of on every call (the function is
# applied row by row). The non-grade labels ("Generic", "Not Graded",
# "No Cover") share the score of "Very Good Plus (VG+)".
_CONDITION_SCORES = {
    "Poor (P)": 0,
    "Fair (F)": 0,
    "Good (G)": 1,
    "Good Plus (G+)": 1,
    "Very Good (VG)": 2,
    "Very Good Plus (VG+)": 3,
    "Generic": 3,
    "Not Graded": 3,
    "No Cover": 3,
    "Near Mint (NM or M-)": 4,
    "Mint (M)": 5,
}


def parse_item_condition_to_int(condition: str) -> int:
    """Map an item condition label to its numeric score.

    Args:
        condition: Condition label exactly as it appears in the data,
            e.g. "Very Good Plus (VG+)" or "Mint (M)".

    Returns:
        An integer from 0 (worst) to 5 (best).

    Raises:
        KeyError: If ``condition`` is not one of the known labels
            (this includes missing values such as NaN).
    """
    return _CONDITION_SCORES[condition]

In [8]:
# Replace the textual condition labels with their numeric scores.
# na_action="ignore" leaves missing labels (NaN, e.g. from na_values="N/A"
# at load time) untouched instead of passing them to the lookup, where they
# would raise KeyError; such rows are removed by the later dropna() step.
# A column that does contain NaN comes back as float rather than int.
for condition_column in ("media_condition", "sleeve_condition"):
    df[condition_column] = df[condition_column].map(
        parse_item_condition_to_int, na_action="ignore")

Drop rows with missing (N/A) values

In [9]:
# Discard every row that has a missing value in any column
# (the displayed shapes go from 4998 rows to 4841).
df = df.dropna(axis="index", how="any")

Drop unneeded columns

In [10]:
# These columns are not used as features: release_date has already been
# reduced to release_year, and label / release_page are dropped unused.
unused_columns = ["label", "release_date", "release_page"]
df = df.drop(columns=unused_columns)

In [11]:
# Preview the frame after condition encoding, NaN removal and column drops.
df.head()

Unnamed: 0,artist,title,release_format,number_of_tracks,price,rating,votes,have,want,limited_edition,media_condition,sleeve_condition,release_year
0,Quintron And Miss Pussycat,Live At Third Man Records,LP,10,19.0,5.0,1.0,9,10,0,5,5,2012.0
1,Jean-Michel Jarre,The Concerts In China,LP,15,21.27,4.0,23.0,204,53,0,3,2,1982.0
2,MC T-Rock,Introducing,"12""",4,21.34,4.42,24.0,107,78,0,3,3,1999.0
3,E.T.M.S.,Sounds Of Humanoid Kind,"12""",13,24.19,4.4,72.0,256,267,1,3,3,1983.0
4,The Art Of Noise,In Visible Silence,LP,11,24.14,4.01,109.0,896,123,0,3,3,1986.0


In [12]:
# (rows, columns) after cleaning.
df.shape

(4841, 13)

## Encoding and scaling data 

In [18]:
# Fit the one-hot encoder on the release format, the only categorical
# feature. Double brackets keep the selection 2-D, as the encoder expects.
format_column = df[["release_format"]]
one_hot_encoder = OneHotEncoder().fit(format_column)

In [19]:
# One-hot encode the release format and densify the sparse result.
# toarray() yields a plain 2-D ndarray. The previous todense() returned an
# np.matrix, a legacy type that NumPy discourages and that recent
# scikit-learn versions refuse as estimator input.
encoded_data = one_hot_encoder.transform(df[["release_format"]]).toarray()

In [20]:
# Every column except artist, title, the one-hot-encoded release format and
# the target (price) is standardised as a numeric feature.
# NOTE(review): the scaler is fitted on the whole frame, before the
# train/test split performed later in the notebook, so test rows contribute
# to the scaling statistics — consider fitting on the training rows only.
numeric_features = df.drop(
    columns=["artist", "title", "release_format", "price"])
scaler = StandardScaler().fit(numeric_features)

In [21]:
# Standardise the numeric columns with the fitted scaler. The column
# selection must match the one the scaler was fitted on.
columns_not_scaled = ["artist", "title", "release_format", "price"]
scaled_data = scaler.transform(df.drop(columns=columns_not_scaled))

In [22]:
# Build the final feature matrix column-wise: the one-hot release-format
# columns first, followed by the scaled numeric columns.
feature_blocks = (encoded_data, scaled_data)
encoded_features = np.concatenate(feature_blocks, axis=1)

In [23]:
# Hold out part of the data for evaluation (train_test_split's default
# split sizes are used). A fixed random_state makes the shuffle — and
# therefore every downstream metric — reproducible across kernel restarts.
train_features, test_features, train_price, test_price = train_test_split(
    encoded_features, df["price"], random_state=42
)

In [24]:
# Ordinary least-squares regression of price on the encoded/scaled features.
# The fit call stays as the last expression so the fitted estimator is shown
# as the cell output.
model = LinearRegression()
model.fit(X=train_features, y=train_price)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)