In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

## Reading the Data
MIND contains about 160k English news articles and more than 15 million impression logs generated by 1 million users. Every news article contains rich textual content including title, abstract, body, category and entities. 

In [3]:
with open('news.tsv', 'r') as file: 
    stop = 0
    df = pd.read_csv(file, delimiter='\t')
    print(df.head)

<bound method NDFrame.head of          N88753 lifestyle           lifestyleroyals  \
0        N45436      news  newsscienceandtechnology   
1        N23144    health                weightloss   
2        N86255    health                   medical   
3        N93187      news                 newsworld   
4        N75236    health                    voices   
...         ...       ...                       ...   
101521  N115249    sports               more_sports   
101522   N64337   finance       finance-real-estate   
101523  N100102    sports                soccer_epl   
101524   N74617     autos               autossports   
101525   N56840    sports               more_sports   

       The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By  \
0         Walmart Slashes Prices on Last-Generation iPads                       
1                           50 Worst Habits For Belly Fat                       
2       Dispose of unwanted prescription drugs during ...         

In [18]:
df.iloc[0:10,4]

0    Apple's new iPad releases bring big deals on l...
1    These seemingly harmless habits are holding yo...
2                                                  NaN
3    Lt. Ivan Molchanets peeked over a parapet of s...
4    I felt like I was a fraud, and being an NBA wi...
5    They seem harmless, but there's a very good re...
6                                                  NaN
7    Several fines came down against NFL players fo...
8    The easiest way to record what's happening on ...
9    There won't be a chill down to your bones this...
Name: Shop the notebooks, jackets, and more that the royals can't live without., dtype: object

In [29]:
# We only want the item at index 3 of every list, as we're only interested in the text-body
df.iat[1,4]

'These seemingly harmless habits are holding you back and keeping you from shedding that unwanted belly fat for good.'

In [4]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
  """Removes HTML tags by replacing everywthing inside html tags with an empty space"""
  return TAG_RE.sub('', text)

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elena\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
def preprocess_text(input):
  # we want everything to be lowercase
  input = input.lower()

  # remove any html tags
  input = remove_tags(input)

  # remove punctuations and numbers
  input = re.sub('[^a-zA-Z]', ' ', input)

  # remove single characters: Mark's -> Mark
  input = re.sub(r'\s+[a-zA-Z]\s', ' ', input)

  # remove multiple spaces
  input = re.sub(r'\s+', ' ', input)

  # remove stopwords
  pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
  input = pattern.sub('', input)

  return input.split()

In [24]:
df.iloc[:,2]

0         newsscienceandtechnology
1                       weightloss
2                          medical
3                        newsworld
4                           voices
                    ...           
101521                 more_sports
101522         finance-real-estate
101523                  soccer_epl
101524                 autossports
101525                 more_sports
Name: lifestyleroyals, Length: 101526, dtype: object

In [23]:
# I want to go through the third and fourth column of my df 
# and apply the preprocessing function to all rows of these columns. 
# Such that in the end I have a list with lists of the words of each cell.
processed_lists = df.iloc[:, 3:4].map(preprocess_text)

In [25]:
print(processed_lists[1:20])

   The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By
1   <built-in method split of str object at 0x0000...                    
2   <built-in method split of str object at 0x0000...                    
3   <built-in method split of str object at 0x0000...                    
4   <built-in method split of str object at 0x0000...                    
5   <built-in method split of str object at 0x0000...                    
6   <built-in method split of str object at 0x0000...                    
7   <built-in method split of str object at 0x0000...                    
8   <built-in method split of str object at 0x0000...                    
9   <built-in method split of str object at 0x0000...                    
10  <built-in method split of str object at 0x0000...                    
11  <built-in method split of str object at 0x0000...                    
12  <built-in method split of str object at 0x0000...                    
13  <built-in method split of str obje

In [10]:
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\elena\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

## Word2Vec Model

In [None]:
model = Word2Vec(sentences=processed_lists, vector_size=100, window=5, min_count=5, sg=1, workers=4)

### Use Model to find similar words

In [None]:
# Use the model to find similar words
print(model.wv.most_similar('black', topn=5))