In [0]:
# This notebook preprocesses attributes.csv and saves the result as df_attrs_prep.pkl,
# the preprocessed file that is later used in the feature-engineering step.

In [0]:
import pandas as pd

# Show untruncated cell contents and up to 500 columns when displaying DataFrames.
# `None` means "no truncation" for max_colwidth; the legacy `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 500)

In [11]:
!pip install -U nltk

from nltk.tokenize import RegexpTokenizer

from nltk.corpus import stopwords

from nltk.stem.porter import *

Requirement already up-to-date: nltk in /usr/local/lib/python3.6/dist-packages (3.4.3)


AttributeError: ignored

## - Preprocess attributes.csv

In [0]:
# Both inputs are fetched from a fixed revision of the coursework repository.
attrs_url = r'https://bitbucket.org/dimitrisor/nlp_coursework_relevance_score_prediction/raw/4b81408f87a4f72746c45d47c5c385437df1ec5b/attributes.csv'
train_url = r'https://bitbucket.org/dimitrisor/nlp_coursework_relevance_score_prediction/raw/4b81408f87a4f72746c45d47c5c385437df1ec5b/train.csv'

# product attributes (columns: product_uid, name, value)
# NOTE(review): the preview below shows "90Â°", which suggests that at least
# part of the file is UTF-8 text decoded as latin-1 - confirm the encoding.
df_attrs = pd.read_csv(attrs_url, encoding='latin-1')
# training set
df_train = pd.read_csv(train_url, encoding='latin-1')

In [0]:
# preview the first rows of the raw attributes table
df_attrs.head()

Unnamed: 0,product_uid,name,value
0,100001.0,Bullet01,Versatile connector for various 90Â° connections and home repair projects
1,100001.0,Bullet02,Stronger than angled nailing or screw fastening alone
2,100001.0,Bullet03,Help ensure joints are consistently straight and strong
3,100001.0,Bullet04,Dimensions: 3 in. x 3 in. x 1-1/2 in.
4,100001.0,Bullet05,Made from 12-Gauge steel


In [0]:
# restrict the attributes to products that are present in the training set
train_product_ids = df_train['product_uid'].unique()
is_train_product = df_attrs['product_uid'].isin(train_product_ids)
df_attrs = df_attrs[is_train_product]

In [0]:
# (rows, columns) of the attributes table at this point
df_attrs.shape

(932000, 3)

In [0]:
# number of distinct products that have at least one attribute row
df_attrs.product_uid.nunique()

38404

In [0]:
# number of distinct values in every column
df_attrs.nunique()

product_uid    38404 
name           4959  
value          173774
dtype: int64

### -- Check the coverage of attributes

The goal is to keep only the attributes that are present for almost all products.

In [0]:
# Rank the attribute names by how many rows they have and relate that count
# to the number of distinct products.
# NOTE(review): the count is per row, not per distinct product - an attribute
# name repeated within one product would inflate its coverage; verify.
rows_per_attr = df_attrs.groupby('name').count()
df_attr_names = rows_per_attr.sort_values(by='product_uid', ascending=False)
n_products = df_attrs['product_uid'].nunique()
df_attr_names['Coverage'] = df_attr_names['product_uid'] / n_products
df_attr_names.head(10)

Unnamed: 0_level_0,product_uid,value,Coverage
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MFG Brand Name,38396,38380,0.999792
Bullet02,38394,38394,0.99974
Bullet03,38380,38380,0.999375
Bullet04,38347,38347,0.998516
Bullet01,38257,38257,0.996172
Bullet05,27741,27741,0.722347
Product Width (in.),27291,27291,0.710629
Product Height (in.),24639,24639,0.641574
Product Depth (in.),24220,24220,0.630663
Bullet06,21032,21032,0.547651


In [0]:
# collect every attribute value of a product into one list per product
values_per_product = df_attrs.groupby('product_uid')['value'].apply(list)
df_attrs_groups = values_per_product.rename('value_all').reset_index()
df_attrs_groups.head(1)

Unnamed: 0,product_uid,value_all
0,100001.0,"[Versatile connector for various 90Â° connections and home repair projects, Stronger than angled nailing or screw fastening alone, Help ensure joints are consistently straight and strong, Dimensions: 3 in. x 3 in. x 1-1/2 in., Made from 12-Gauge steel, Galvanized for extra corrosion resistance, Install with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws, 12, Galvanized Steel, Simpson Strong-Tie, 1, 1.5, 3, 0.26, 3]"


In [0]:
# Same grouping, limited to the five attribute names at the top of the
# coverage table above.
top_attr_names = ['MFG Brand Name', 'Bullet01', 'Bullet02', 'Bullet03', 'Bullet04']
df_attrs_5 = df_attrs[df_attrs['name'].isin(top_attr_names)]
df_attrs_groups_5 = (
    df_attrs_5.groupby('product_uid')['value']
    .apply(list)
    .rename('value_5')
    .reset_index()
)
df_attrs_groups_5.head(1)

Unnamed: 0,product_uid,value_5
0,100001.0,"[Versatile connector for various 90Â° connections and home repair projects, Stronger than angled nailing or screw fastening alone, Help ensure joints are consistently straight and strong, Dimensions: 3 in. x 3 in. x 1-1/2 in., Simpson Strong-Tie]"


In [0]:
# Attach the five-attribute lists to the all-attribute lists, keeping every
# product of the left frame.
# NOTE: this rebinds df_attrs_groups, so run the cell once per kernel session;
# a second run merges `value_5` again.
df_attrs_groups = df_attrs_groups.merge(
    df_attrs_groups_5,
    on='product_uid',
    how='left',
)

In [0]:
# check the merged frame: one product with both value lists
df_attrs_groups.head(1)

Unnamed: 0,product_uid,value_all,value_5
0,100001.0,"[Versatile connector for various 90Â° connections and home repair projects, Stronger than angled nailing or screw fastening alone, Help ensure joints are consistently straight and strong, Dimensions: 3 in. x 3 in. x 1-1/2 in., Made from 12-Gauge steel, Galvanized for extra corrosion resistance, Install with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws, 12, Galvanized Steel, Simpson Strong-Tie, 1, 1.5, 3, 0.26, 3]","[Versatile connector for various 90Â° connections and home repair projects, Stronger than angled nailing or screw fastening alone, Help ensure joints are consistently straight and strong, Dimensions: 3 in. x 3 in. x 1-1/2 in., Simpson Strong-Tie]"


In [0]:
# merge the text of each attribute to a common text
def _join_attr_values(values):
    """Join one product's attribute values into a single space-separated string.

    Missing values (NaN/None) are skipped so they do not end up in the text as
    the literal token "nan". If the cell holds no list at all (e.g. a product
    without a match in the left merge that produced `value_5`), an empty
    string is returned instead of raising a TypeError.
    """
    if not isinstance(values, (list, tuple)):
        return ''
    return ' '.join(str(value) for value in values if pd.notna(value))


df_attrs_groups['Atrr_text_all'] = df_attrs_groups['value_all'].apply(_join_attr_values)
df_attrs_groups['Atrr_text_5'] = df_attrs_groups['value_5'].apply(_join_attr_values)

In [0]:
# lower-case both text columns
for text_col in ('Atrr_text_all', 'Atrr_text_5'):
    df_attrs_groups[text_col] = df_attrs_groups[text_col].map(str.lower)

In [0]:
# the raw value lists are no longer needed once the text columns exist
df_attrs_groups = df_attrs_groups.drop(columns=['value_all', 'value_5'])

In [0]:
# Split the five-attribute text into word tokens; the `\w+` pattern keeps only
# runs of word characters, which drops the punctuation.
# The tokens are stored in a new column.
tokenizer = RegexpTokenizer(r'\w+')
df_attrs_groups['Atrr_tokens'] = df_attrs_groups['Atrr_text_5'].map(tokenizer.tokenize)

In [0]:
# remove stopwords
# The stopword list is an NLTK corpus that has to be downloaded once per
# environment; fetch it on demand so the cell also works on a fresh kernel.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

df_attrs_groups['Atrr_tokens_sw'] = df_attrs_groups['Atrr_tokens'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [0]:
# create a column with the tokens as text (without stopwords)
#df_attrs_groups['Atrr_text'] = df_attrs_groups['Atrr_tokens_sw'].apply(lambda tokens: ' '.join(tokens))

In [0]:
# reduce every remaining token to its Porter stem
stemmer = PorterStemmer()
df_attrs_groups['Atrr_stem'] = df_attrs_groups['Atrr_tokens_sw'].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)

In [0]:
# inspect one product after all preprocessing steps
df_attrs_groups.head(1)

Unnamed: 0,product_uid,Atrr_text_all,Atrr_text_5,Atrr_tokens,Atrr_tokens_sw,Atrr_stem
0,100001.0,versatile connector for various 90â° connections and home repair projects stronger than angled nailing or screw fastening alone help ensure joints are consistently straight and strong dimensions: 3 in. x 3 in. x 1-1/2 in. made from 12-gauge steel galvanized for extra corrosion resistance install with 10d common nails or #9 x 1-1/2 in. strong-drive sd screws 12 galvanized steel simpson strong-tie 1 1.5 3 0.26 3,versatile connector for various 90â° connections and home repair projects stronger than angled nailing or screw fastening alone help ensure joints are consistently straight and strong dimensions: 3 in. x 3 in. x 1-1/2 in. simpson strong-tie,"[versatile, connector, for, various, 90â, connections, and, home, repair, projects, stronger, than, angled, nailing, or, screw, fastening, alone, help, ensure, joints, are, consistently, straight, and, strong, dimensions, 3, in, x, 3, in, x, 1, 1, 2, in, simpson, strong, tie]","[versatile, connector, various, 90â, connections, home, repair, projects, stronger, angled, nailing, screw, fastening, alone, help, ensure, joints, consistently, straight, strong, dimensions, 3, x, 3, x, 1, 1, 2, simpson, strong, tie]","[versatil, connector, variou, 90â, connect, home, repair, project, stronger, angl, nail, screw, fasten, alon, help, ensur, joint, consist, straight, strong, dimens, 3, x, 3, x, 1, 1, 2, simpson, strong, tie]"


In [0]:
# list the columns produced so far
df_attrs_groups.columns

Index(['product_uid', 'Atrr_text_all', 'Atrr_text_5', 'Atrr_tokens',
       'Atrr_tokens_sw', 'Atrr_stem'],
      dtype='object')

In [0]:
# keep only the columns that are written to the output file
output_columns = ['product_uid', 'Atrr_text_all', 'Atrr_stem']
df_attrs_groups2 = df_attrs_groups[output_columns]
df_attrs_groups2.columns

In [0]:
# Persist the preprocessed attributes for the feature-engineering step.
# The destination can be overridden through the ATTRS_PREP_PATH environment
# variable; without it the original location is used, so existing setups keep
# working unchanged.
import os

output_path = os.environ.get(
    'ATTRS_PREP_PATH',
    r'C:\Users\Dimos\Desktop\MSc\Semester 4\NLP\Coursework\data\preprocessed\df_attrs_prep.pkl',
)
df_attrs_groups2.to_pickle(output_path)