In [1]:
import os
import pandas as pd
import numpy as np

ARTICLES_PATH = '/media/hdd/training_data/transfer-learning-sed/'

In [2]:
def load_data(path):
    csvs = os.listdir(path)
    csvs.remove('sample_submission.csv')
    csvs.remove('test.csv')
    articles_paths = [os.path.join(path, csv) for csv in csvs]
    X_titles = []
    X_contents = []
    y = []
    for article_path in articles_paths:
        new_articles = pd.read_csv(article_path)
        X_titles.extend(list(new_articles['title'].copy()))
        X_contents.extend(list(new_articles['content'].copy()))
        y.extend(list(new_articles['tags'].copy()))

    return X_titles, X_contents, y

In [3]:
X1, X2, y = load_data(ARTICLES_PATH)
print('Titles:\n')
print('\n'.join(X1[:3]))
print('Contents:\n')
print('******\n'.join(X2[:3]))
print('Tags:\n')
print('; '.join(y[:3]))

Titles:

How can I get chewy chocolate chip cookies?
How should I cook bacon in an oven?
What is the difference between white and brown eggs?
Contents:

<p>My chocolate chips cookies are always too crisp. How can I get chewy cookies, like those of Starbucks?</p>

<hr>

<p>Thank you to everyone who has answered. So far the tip that had the biggest impact was to chill and rest the dough, however I also increased the brown sugar ratio and increased a bit the butter. Also adding maple syrup helped. </p>
******
<p>I've heard of people cooking bacon in an oven by laying the strips out on a cookie sheet. When using this method, how long should I cook the bacon for, and at what temperature?</p>
******
<p>I always use brown extra large eggs, but I can't honestly say why I do this other than habit at this point. Are there any distinct advantages or disadvantages like flavor, shelf life, etc?</p>

Tags:

baking cookies texture; oven cooking-time bacon; eggs


In [4]:
X2[0]

'<p>My chocolate chips cookies are always too crisp. How can I get chewy cookies, like those of Starbucks?</p>\n\n<hr>\n\n<p>Thank you to everyone who has answered. So far the tip that had the biggest impact was to chill and rest the dough, however I also increased the brown sugar ratio and increased a bit the butter. Also adding maple syrup helped. </p>\n'

# Cleanup

In [5]:
import re
html_tags = '<[^>]*>'

In [6]:
# remove html tags
X2 = [re.sub(html_tags, "", content) for content in X2]

In [7]:
X2[0]

'My chocolate chips cookies are always too crisp. How can I get chewy cookies, like those of Starbucks?\n\n\n\nThank you to everyone who has answered. So far the tip that had the biggest impact was to chill and rest the dough, however I also increased the brown sugar ratio and increased a bit the butter. Also adding maple syrup helped. \n'

In [8]:
# remove new line
X2 = [re.sub('\\n', " ", content) for content in X2]

In [9]:
X2[0]

'My chocolate chips cookies are always too crisp. How can I get chewy cookies, like those of Starbucks?    Thank you to everyone who has answered. So far the tip that had the biggest impact was to chill and rest the dough, however I also increased the brown sugar ratio and increased a bit the butter. Also adding maple syrup helped.  '

In [10]:
# remove numbers, commas, dots, etc.
X2 = [re.sub("[^a-zA-Z]", " ", content) for content in X2]

In [11]:
X2[0]

'My chocolate chips cookies are always too crisp  How can I get chewy cookies  like those of Starbucks     Thank you to everyone who has answered  So far the tip that had the biggest impact was to chill and rest the dough  however I also increased the brown sugar ratio and increased a bit the butter  Also adding maple syrup helped   '

In [12]:
# remove excess whitespaces
X2 = [re.sub("\s+"," ", content) for content in X2]

In [13]:
X2[0]

'My chocolate chips cookies are always too crisp How can I get chewy cookies like those of Starbucks Thank you to everyone who has answered So far the tip that had the biggest impact was to chill and rest the dough however I also increased the brown sugar ratio and increased a bit the butter Also adding maple syrup helped '

In [14]:
y = [tags.split(' ') for tags in y]

# Stemming

In [15]:
from nltk.stem.snowball import SnowballStemmer

In [16]:
stemmer = SnowballStemmer('english')

In [17]:
X2 = [[stemmer.stem(word) for word in words.split(' ') if word is not ''] for words in X2]

# Removing stopwords

In [18]:
from nltk.corpus import stopwords

In [19]:
stopwords = set(stopwords.words('english'))

In [20]:
', '.join(X2[0])

'my, chocol, chip, cooki, are, alway, too, crisp, how, can, i, get, chewi, cooki, like, those, of, starbuck, thank, you, to, everyon, who, has, answer, so, far, the, tip, that, had, the, biggest, impact, was, to, chill, and, rest, the, dough, howev, i, also, increas, the, brown, sugar, ratio, and, increas, a, bit, the, butter, also, ad, mapl, syrup, help'

In [21]:
X2 = [[word for word in words if word not in stopwords] for words in X2]

In [22]:
', '.join(X2[0])

'chocol, chip, cooki, alway, crisp, get, chewi, cooki, like, starbuck, thank, everyon, answer, far, tip, biggest, impact, chill, rest, dough, howev, also, increas, brown, sugar, ratio, increas, bit, butter, also, ad, mapl, syrup, help'

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X2 = [' '.join(words) for words in X2]

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.2)

# Vectorizing

In [41]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [42]:
vectorizer = CountVectorizer(analyzer='word', min_df=0.005, ngram_range=(1, 3))
tfidf_transformer = TfidfTransformer(norm='l1', use_idf=True)

In [43]:
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.005,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [44]:
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)