In [1]:
import os
import argparse
import pandas as pd
import numpy as np
import pickle
import logging
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

logger = logging.getLogger(__name__)


class TfidfTrainer:
    """Fits a TF-IDF vectorizer on a corpus and persists the results.

    Both attributes stay ``None`` until :meth:`train` has been called.
    """

    def __init__(self):
        # Fitted TfidfVectorizer instance; set by train().
        self.tf_transformer = None
        # Dense TF-IDF matrix of the training corpus; set by train().
        self.text_vectors = None

    # Train TF-IDF model
    def train(self, corpus):
        """Fit the vectorizer on ``corpus`` and keep its TF-IDF vectors.

        Args:
            corpus: Iterable of raw text documents. It is consumed exactly
                once, so a generator works as well as a list.
        """
        logger.info("Training model")
        tf = TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 2),
            stop_words="english",
            lowercase=True,
            max_features=50000,
        )
        # fit_transform() learns the vocabulary and builds the matrix in one
        # pass. Calling fit() first and fit_transform() afterwards processed
        # the corpus twice and broke on one-shot iterables.
        vectors = tf.fit_transform(corpus)
        self.tf_transformer = tf
        # Densified because get_vectors() callers receive an array; note this
        # can be large (documents x up to 50000 features).
        self.text_vectors = vectors.toarray()

    # Returns the trained sentence vectors
    def get_vectors(self):
        """Return the matrix stored by train() (``None`` before training)."""
        return self.text_vectors

    # Save the model in pickle format
    def save(self, path):
        """Pickle the fitted vectorizer and the corpus vectors into ``path``.

        Writes ``tfidf_transform.pkl`` and ``tfidf_matrix.pkl``. The
        directory is created first if it does not exist yet.

        Args:
            path: Directory that receives the two pickle files.
        """
        os.makedirs(path, exist_ok=True)

        logger.info("Saving model to %s", path)
        # Save TfidfVectoriser vocab
        with open(os.path.join(path, 'tfidf_transform.pkl'), "wb") as pickle_file:
            pickle.dump(self.tf_transformer, pickle_file)

        # Save corpus text vectors
        logger.info("Saving vectors to %s", path)
        with open(os.path.join(path, 'tfidf_matrix.pkl'), "wb") as pickle_file:
            pickle.dump(self.text_vectors, pickle_file)


class Dataset:
    """Reads ``sample.csv`` from a directory and prepares it for training.

    On construction the CSV is loaded into ``df_raw`` and a cleaned copy with
    an extra ``variety_region`` column is stored in ``df``.
    """

    def __init__(self, dir):
        # NOTE: the parameter name shadows the builtin dir(); it is kept
        # unchanged so existing callers keep working.
        self.dir = dir
        self.df_raw = self.__read_data__()
        self.df = self.__process_data__()

    # Returns the dataset corpus
    def get_corpus(self):
        """Return the ``description`` column of the cleaned frame as a list."""
        return self.df['description'].to_list()

    # Returns corpus labels
    def get_labels(self):
        """Return the ``variety_region`` column of the cleaned frame."""
        return self.df['variety_region']

    # Returns dataset
    def get_df(self):
        """Return the cleaned DataFrame."""
        return self.df

    # Saves the dataframe as a pickle
    def save(self, dir):
        """Pickle a fixed subset of columns to ``<dir>/tfidf_metadata.pkl``.

        The directory is created first if it does not exist yet.

        Args:
            dir: Directory that receives the pickle file.
        """
        columns = ['title', 'description', 'points', 'price', 'variety', 'region_1']
        os.makedirs(dir, exist_ok=True)
        self.df[columns].to_pickle(os.path.join(dir, 'tfidf_metadata.pkl'))

    # Read in dataframe
    def __read_data__(self):
        """Load ``sample.csv`` from ``self.dir``.

        Raises:
            ValueError: If the CSV file does not exist.
        """
        path = os.path.join(self.dir, 'sample.csv')
        # path = os.path.join(self.dir, 'sample_10000.csv')
        # path = os.path.join(self.dir, 'winemag-data-130k-v2.csv')
        logger.info("Loading data from %s", path)
        if not os.path.isfile(path):
            # ValueError is kept as the exception type for compatibility; the
            # message now says what is wrong instead of carrying a bare path.
            raise ValueError("Data file not found: %s" % path)
        return pd.read_csv(path)

    # Process dataframe
    def __process_data__(self):
        """Drop rows without variety/region and add the combined label.

        Returns:
            A new DataFrame with a fresh 0..n-1 index and a
            ``variety_region`` column of the form ``"<variety>-<region_1>"``.
        """
        raw = self.df_raw
        # Remove nans from important columns
        has_labels = raw['variety'].notna() & raw['region_1'].notna()
        df = raw[has_labels].reset_index(drop=True)
        # Create new feature variety + region_1. Vectorised concatenation
        # replaces a per-row Python join; astype(str) lets non-string cells
        # be stringified instead of raising TypeError.
        df['variety_region'] = df['variety'].astype(str) + '-' + df['region_1'].astype(str)
        return df



In [2]:
# Run configuration: where sample.csv is read from and where the pickled
# artefacts are written. Both paths are relative to the notebook's working
# directory.
args = {
    'data_dir': '../data/raw',
    'model_dir': '../models'
}

# Read training data (loads and cleans the CSV on construction)
dataset = Dataset(args['data_dir'])
corpus = dataset.get_corpus()

# Train TF-IDF model
model = TfidfTrainer()
model.train(corpus)

# Save run: vectorizer + matrix pickles, then the metadata frame
model.save(args['model_dir'])
dataset.save(args['model_dir'])

# Validate
# NOTE(review): disabled; no `Validation` class is defined in the visible
# part of this notebook, so these lines cannot simply be uncommented.
# valid = Validation()
# valid.plot_pca(model.get_vectors(), dataset.get_labels())

In [3]:
# Pull the cleaned frame, the TF-IDF matrix and the labels out of the
# trained objects for inspection and plotting.
df = dataset.get_df()
x = model.get_vectors()
y = dataset.get_labels()

In [4]:
# Display the TF-IDF matrix; NumPy abbreviates large arrays in the repr.
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:


# Wrap the TF-IDF matrix in a frame (one column per feature) and attach the
# labels so seaborn can colour the points by them.
feat_cols = ['embedding' + str(i) for i in range(x.shape[1])]
df_embed = pd.DataFrame(x, columns=feat_cols)
df_embed['y'] = y

# Project onto the first two principal components. random_state is fixed so
# the projection is reproducible when PCA selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(x)
df_embed['pca-one'] = pca_result[:, 0]
df_embed['pca-two'] = pca_result[:, 1]

# Marker-size column: 10 for every row whose label is not 'unknown', else 0.
# NOTE(review): this column is not passed to the plot below - confirm whether
# `size="size"` was intended.
df_embed['size'] = 0
df_embed.loc[df_embed['y'] != 'unknown', 'size'] = 10

# Explicit figure/axes instead of the implicit pyplot current axes, plus a
# title so the figure stands on its own.
fig, ax = plt.subplots(figsize=(16, 10))
ax.set_title("PCA projection of TF-IDF vectors, coloured by variety-region")
sns.scatterplot(
    x="pca-one", y="pca-two",
    hue="y",
    data=df_embed,
    legend="full",
    alpha=0.8,
    ax=ax
)


<matplotlib.axes._subplots.AxesSubplot at 0x1296e1670>