With the following script, we will practice 1) loading CSV text files using the Python dataframe library pandas, 2) cleaning/manipulating text using libraries such as BeautifulSoup, NLTK, or re, and 3) building feature vectors for further analysis.

# Loading a CSV file

In [7]:
import pandas as pd
# Read the product CSV into a DataFrame. The comma separator is read_csv's
# default; it is spelled out to make the expected file format explicit.
df = pd.read_csv('Leon-Skirts-All-Products.csv', sep=',')
# print(...) with a single argument behaves the same on Python 2 and 3.
print(df.shape)  # size of the dataframe as (rows, columns)
df.head()        # last expression of the cell: shows the top few rows

(134856, 4)


Unnamed: 0,ProductId,StoredName,ProductName,Description
0,2895702,http://cdn.styloko.com/images/1637984.jpg,Silk skirt,Brown silk skirt from Ralph Lauren Black Label...
1,2895702,http://cdn.styloko.com/images/3443263.jpg,Silk skirt,Brown silk skirt from Ralph Lauren Black Label...
2,2895702,http://cdn.styloko.com/images/3443264.jpg,Silk skirt,Brown silk skirt from Ralph Lauren Black Label...
3,2895702,http://cdn.styloko.com/images/3443265.jpg,Silk skirt,Brown silk skirt from Ralph Lauren Black Label...
4,2895702,http://cdn.styloko.com/images/3443266.jpg,Silk skirt,Brown silk skirt from Ralph Lauren Black Label...


In [8]:
# Drop duplicate rows based on ProductId. With the default keep='first',
# the first row seen for each id is retained and later ones are discarded.
df = df.drop_duplicates(subset='ProductId')
# print(...) with a single argument behaves the same on Python 2 and 3.
print(df.shape)
df.head()

(35948, 4)


Unnamed: 0,ProductId,StoredName,ProductName,Description
0,2895702,http://cdn.styloko.com/images/1637984.jpg,Silk skirt,Brown silk skirt from Ralph Lauren Black Label...
6,3498543,http://cdn.styloko.com/images/2243555.jpg,stretch denim skirt,"- FIT: - slim, straight style - length of side..."
8,3498936,http://cdn.styloko.com/images/2243948.jpg,stretch poly/viscose skirt,"- soft, floaty fabric in a classic blend, virt..."
9,3499123,http://cdn.styloko.com/images/2244135.jpg,business skirt,- a classic outfit component for a smart offic...
11,3499296,http://cdn.styloko.com/images/2244308.jpg,cotton skirt,"-casual all-rounder for leisure, town and holi..."


# Text Cleaning

In [68]:
# Extract product name and description and combine them into one text field.
# Missing values are replaced with '' first: adding a string to NaN yields
# NaN, which would discard the half of the text that is present.
df["texts"] = df["ProductName"].fillna('') + ' ' + df["Description"].fillna('')
# print(...) with a single argument behaves the same on Python 2 and 3.
print(df.head())
# Plain Python list of the combined texts, in row order, for the cleaning steps.
texts = df["texts"].tolist()
print(texts[:2])

    ProductId                                 StoredName  \
0     2895702  http://cdn.styloko.com/images/1637984.jpg   
6     3498543  http://cdn.styloko.com/images/2243555.jpg   
8     3498936  http://cdn.styloko.com/images/2243948.jpg   
9     3499123  http://cdn.styloko.com/images/2244135.jpg   
11    3499296  http://cdn.styloko.com/images/2244308.jpg   

                   ProductName  \
0                   Silk skirt   
6          stretch denim skirt   
8   stretch poly/viscose skirt   
9               business skirt   
11                cotton skirt   

                                          Description  \
0   Brown silk skirt from Ralph Lauren Black Label...   
6   - FIT: - slim, straight style - length of side...   
8   - soft, floaty fabric in a classic blend, virt...   
9   - a classic outfit component for a smart offic...   
11  -casual all-rounder for leisure, town and holi...   

                                                texts  
0   Silk skirt Brown silk skirt fro

In [70]:
import re

from nltk.corpus import stopwords

# (1) Replace every character that is not an ASCII letter with a space.
# [^a-zA-Z] is stricter than [^\w]: digits and underscores are removed too.
# str() coerces any non-string entry to its string form before substitution.
regex = re.compile('[^a-zA-Z]')
texts = [regex.sub(' ', str(text)) for text in texts]

# (2) Convert to lowercase.
texts = [text.lower() for text in texts]

# (3) Cap runs of a repeated character at two.
# (.)\1+ matches any character followed by one or more copies of itself,
# and the replacement keeps exactly two of them, i.e. happpy -> happy.
texts = [re.sub(r'(.)\1+', r'\1\1', text) for text in texts]

# (4) If the first letter of a word is doubled, keep one copy,
# i.e. hhappy -> happy.
# NOTE: this also shortens genuine words that start with a double letter
# (e.g. eel -> el).
texts = [re.sub(r'\b(\w)\1+', r'\1', text) for text in texts]

# (5) Remove stopwords.
cachedStopWords = stopwords.words("english")  # could add additional words to the list if necessary


def remove_stopWords(s):
    """Return *s* without English stopwords, words joined by single spaces."""
    # Build a set for O(1) membership tests instead of scanning the list for
    # every word. It is rebuilt on each call so that words appended to
    # cachedStopWords later are still honoured.
    stop_words = set(cachedStopWords)
    return ' '.join([w for w in s.split() if w not in stop_words])


texts = [remove_stopWords(text) for text in texts]

# (6) Remove extra whitespace: collapse runs of spaces and trim both ends.
texts = [re.sub(' +', ' ', text).strip() for text in texts]

# print(...) with a single argument behaves the same on Python 2 and 3.
print(texts[:5])


['silk skirt brown silk skirt ralph lauren black label featuring concealed fastening diagonal layers ruffled fabric', 'stretch denim skirt fit slim straight style length side seam vary size size approx cm fabric light stretch denim typical washed effects details woven belt double metal buckle narrower multicoloured woven tape fixed waistband belt loops metal button zip fly five pockets decorative dots embossed back pockets cotton elastane belt polyester cotton', 'stretch poly viscose skirt soft floaty fabric classic blend virtually crease resistant looks smart simple curvy cut length side seam excluding waistband approx cm vary slightly size size zip kick pleat centre back mock back pockets', 'business skirt classic outfit component smart office look side seam length approx cm may vary slightly size size smooth finish soft drape crease resistant line skirt gores beautiful waistband decorative dividing seam stitching zip side seam taffeta lining outer fabric polyester cotton elastane li

# Build a Term-document matrix
### Reference: http://blog.christianperone.com/2011/10/machine-learning-text-feature-extraction-tf-idf-part-ii/

In [90]:
import operator

from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
# Learn the vocabulary and build the count matrix based on word frequency
# (one row per document, one column per term). Only the first five
# documents are used here to keep the demonstration small.
tf_matrix = count_vectorizer.fit_transform(texts[:5])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    terms = list(count_vectorizer.get_feature_names_out())
else:
    terms = count_vectorizer.get_feature_names()

# Column sums give the total count of each term across the documents;
# .A1 flattens the resulting 1 x n matrix into a 1-D array.
freqs = tf_matrix.sum(axis=0).A1

# print(...) with a single argument behaves the same on Python 2 and 3.
print("***Vocabulary frequency from most frequent to least***")
# Sort the (term, count) pairs by count, highest first.
print(sorted(zip(terms, freqs), key=operator.itemgetter(1), reverse=True))

***Vocabulary frequency from most frequent to least
[(u'size', 7), (u'skirt', 7), (u'cotton', 6), (u'fabric', 6), (u'belt', 5), (u'pockets', 5), (u'seam', 5), (u'side', 5), (u'approx', 4), (u'back', 4), (u'cm', 4), (u'length', 4), (u'vary', 4), (u'waistband', 4), (u'zip', 4), (u'metal', 3), (u'slightly', 3), (u'stretch', 3), (u'woven', 3), (u'buckle', 2), (u'button', 2), (u'classic', 2), (u'crease', 2), (u'decorative', 2), (u'denim', 2), (u'elastane', 2), (u'fly', 2), (u'lining', 2), (u'loops', 2), (u'multicoloured', 2), (u'polyester', 2), (u'resistant', 2), (u'silk', 2), (u'smart', 2), (u'soft', 2), (u'according', 1), (u'acetate', 1), (u'beautiful', 1), (u'black', 1), (u'blend', 1), (u'brown', 1), (u'business', 1), (u'casual', 1), (u'centre', 1), (u'component', 1), (u'concealed', 1), (u'curvy', 1), (u'cut', 1), (u'details', 1), (u'diagonal', 1), (u'dividing', 1), (u'dots', 1), (u'double', 1), (u'drape', 1), (u'effects', 1), (u'embossed', 1), (u'excluding', 1), (u'extra', 1), (u'fasten

In [89]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the inverse-document-frequency weights on the term-count matrix.
# norm="l2" only affects transformed rows; idf_ itself holds one weight
# per vocabulary term.
tfidf = TfidfTransformer(norm="l2", use_idf=True)
tfidf.fit(tf_matrix)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = list(count_vectorizer.get_feature_names_out())
else:
    feature_names = count_vectorizer.get_feature_names()

# print(...) with a single argument behaves the same on Python 2 and 3.
print("***Vocabulary with inverse document frequency accounted for***")
# Sort the (term, idf) pairs by idf weight, largest first.
print(sorted(zip(feature_names, tfidf.idf_), key=lambda x: x[1], reverse=True))

Vocabulary with inverse document frequency accounted for
[(u'according', 2.09861228866811), (u'acetate', 2.09861228866811), (u'beautiful', 2.09861228866811), (u'black', 2.09861228866811), (u'blend', 2.09861228866811), (u'brown', 2.09861228866811), (u'business', 2.09861228866811), (u'casual', 2.09861228866811), (u'centre', 2.09861228866811), (u'component', 2.09861228866811), (u'concealed', 2.09861228866811), (u'curvy', 2.09861228866811), (u'cut', 2.09861228866811), (u'denim', 2.09861228866811), (u'details', 2.09861228866811), (u'diagonal', 2.09861228866811), (u'dividing', 2.09861228866811), (u'dots', 2.09861228866811), (u'double', 2.09861228866811), (u'drape', 2.09861228866811), (u'effects', 2.09861228866811), (u'embossed', 2.09861228866811), (u'excluding', 2.09861228866811), (u'extra', 2.09861228866811), (u'fastening', 2.09861228866811), (u'featuring', 2.09861228866811), (u'finish', 2.09861228866811), (u'fit', 2.09861228866811), (u'five', 2.09861228866811), (u'fixed', 2.09861228866811)