In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

We provide the following dataset (ASSIGNMENT_2.csv):
    
| Composition title | Composition Writers | Recording Title | Recording Writes |Action |
| --- | --- |--- | --- |--- |
| Yellow submarine | Leo Ouha |Yellow submarine(remix) |Leo Ouha |Leo Ouha |ACCEPTED |
| Shape of you | Ed Sheeran| Anaconda | Mick George | Roco Selto |Leo REJECTED |


Train an ML/DL model for pair matching of compositions and recordings.
Report and evaluate the results.

In [2]:
input_dataset = pd.read_csv("dataset/ASSIGNMENT_2.csv")

In [3]:
input_dataset = input_dataset.drop_duplicates().dropna()

for col in list(input_dataset.columns):
    input_dataset.loc[:, col] = input_dataset.loc[:, col].apply(lambda x: x.lower())

print(input_dataset["Action"].str.lower().unique())

['rejected' 'accepted' 'no decision' 'no_decision']


In [4]:
input_dataset.loc[input_dataset["Action"] == 'no decision', "Action"] = "no_decision"

In [5]:
input_dataset = input_dataset.loc[input_dataset["Action"]!="no_decision", :]

In [6]:
# input_dataset.dropna().drop_duplicates().shape

In [7]:
# oe_style = OneHotEncoder()
# oe_results = oe_style.fit_transform(input_dataset[["Action"]])
# oh_labels = pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_)
# oh_labels.sum()

In [8]:
labelizer = LabelEncoder()
lb_res = labelizer.fit_transform(input_dataset.loc[:, "Action"])

input_dataset.loc[:, "Action"]= lb_res
print(Counter(input_dataset["Action"]))

Counter({0: 1485, 1: 522})


In [9]:
# in this kind of task we have to keep stopwords and punctuation it is important on these kind of texts (just like sentiment analysis)

In [10]:
# For starters we need to create a baseline to beat. So we could do a first run with a simple logistic regression and
# an XGBoost

In [11]:
input_dataset.head(2)

Unnamed: 0,Composition Title,Composition Writers,Recording Title,Recording Writers,Recording Artist,Action
0,kokaina,yassine baybah|daniel dlouhy,kokaina,a baybah c dlouhy,miami yacine,1
1,por estar contigo,"martinez escamilla,felipe de jesus",estar contigo,martinez de ubago rodriguez alejandro,"alex, jorge y lena",1


In order to run any kind of machine learning model, we need to transform the features into tf-idf vectors.

If we had more data i would've trained embeddings from the dataset.

In [12]:
def vectorizer(dataset):
    tfidf_vect = dict()
    tfidf_wm = dict()
    for col in dataset.columns:
        tfidf_vect[col] = TfidfVectorizer(analyzer='word', stop_words="english")
        tfidf_wm[col] = tfidf_vect[col].fit_transform(dataset[col])
    return tfidf_vect, tfidf_wm

def infer_vector(dataset, tfidf_vect):
    tfidf_wm = dict()
    for col in dataset.columns:
        tfidf_wm[col] = tfidf_vect[col].fit_transform(dataset[col])
    return tfidf_wm

In [13]:
feature_cols = list(input_dataset.loc[:, input_dataset.columns!="Action"].columns)

In [14]:
input_dataset.loc[:, feature_cols].describe()

Unnamed: 0,Composition Title,Composition Writers,Recording Title,Recording Writers,Recording Artist
count,2007,2007,2007,2007,2007
unique,1802,1835,1897,1812,1660
top,flower of scotland,roy murdoch buchanan williamson,flower of scotland,traditional,various artists
freq,9,9,9,13,9


In [15]:
lala = TfidfVectorizer(analyzer='word', stop_words="english")


In [16]:
input_dataset.loc[:, feature_cols[0]] =  input_dataset.loc[:, feature_cols[1:]].apply(lambda x: ",".join(x), axis=1)

In [17]:
input_dataset = pd.DataFrame(input_dataset.loc[:, feature_cols[0]])

In [18]:
x1  = lala.fit_transform(input_dataset.iloc[:,0].tolist())

In [19]:
lb_res

array([1, 1, 1, ..., 0, 0, 0])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(x1, lb_res, shuffle=True, random_state=42, test_size=0.2)

In [21]:
model = XGBClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f1_score(y_pred, y_test))



0.3176470588235294


In [22]:
confusion_matrix(y_pred, y_test)

array([[259,  89],
       [ 27,  27]])