In [1]:
import pandas as pd

# Load the dataset listing from a CSV in the working directory.
file = pd.read_csv('voted-kaggle-dataset.csv')
# Summary statistics of the numeric columns; shown as the cell output.
file.describe()

Unnamed: 0,Votes
count,2150.0
mean,24.011628
std,64.788465
min,2.0
25%,4.0
50%,8.0
75%,19.0
max,1241.0


In [2]:
# Plain-text preview of the first rows; wide frames wrap into several column groups.
print(file.head())

                         Title  \
0  Credit Card Fraud Detection   
1     European Soccer Database   
2      TMDB 5000 Movie Dataset   
3    Global Terrorism Database   
4      Bitcoin Historical Data   

                                            Subtitle  \
0  Anonymized credit card transactions labeled as...   
1  25k+ matches, players & teams attributes for E...   
2                Metadata on ~5,000 movies from TMDb   
3  More than 170,000 terrorist attacks worldwide,...   
4  Bitcoin data at 1-min intervals from select ex...   

                          Owner  Votes  \
0  Machine Learning Group - ULB   1241   
1                  Hugo Mathien   1046   
2     The Movie Database (TMDb)   1024   
3              START Consortium    789   
4                        Zielak    618   

                                            Versions  \
0          Version 2,2016-11-05|Version 1,2016-11-03   
1  Version 10,2016-10-24|Version 9,2016-10-24|Ver...   
2                               Versi

In [3]:
import re
import nltk
from nltk.tokenize import TreebankWordTokenizer
# Look at one description to see the raw text and how it tokenizes.
print('Analizing 1 sentence')
# Single label-based lookup (row 0, column 'Description') instead of
# materialising the whole row twice with file.loc[0]['Description'].
first_description = file.loc[0, 'Description']
print(first_description)
tokenizer = TreebankWordTokenizer()
tokenized_words = tokenizer.tokenize(first_description)
print('Tokenized Words', tokenized_words)

Analizing 1 sentence
The datasets contains transactions made by credit cards in September 2013 by european cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.
It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, ... V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. Feature 'Class' is the response variable and it take

In [4]:
import nltk
from nltk.stem import WordNetLemmatizer

def find_pos(word):
    """Return the WordNet POS code for ``word``.

    The word is tokenized and tagged with NLTK; the tag of the first token is
    mapped by its first letter:

    * ``J*`` (adjective tags)                              -> ``'a'``
    * ``R*`` (adverb tags - 'RB', 'RBR', 'RBS')            -> ``'r'``
    * ``V*`` (verb tags - 'VB', 'VBD', 'VBG', 'VBN', ...)  -> ``'v'``
    * anything else (noun tags - 'NN', 'NNS', 'NNP', ...)  -> ``'n'``

    Args:
        word: The text to tag. Only its first token is considered.

    Returns:
        One of ``'a'``, ``'r'``, ``'v'`` or ``'n'``. Input that yields no
        tokens (empty or whitespace-only) falls back to ``'n'``.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(word))
    # Empty input produces no (token, tag) pairs; indexing [0] would raise.
    if not tagged:
        return 'n'
    # Slice instead of index so an empty tag string also falls through to 'n'.
    tag_initial = tagged[0][1][:1].lower()
    return {'j': 'a', 'r': 'r', 'v': 'v'}.get(tag_initial, 'n')

def words_lemmatizer(text, encoding = 'utf-8'):
    """Lemmatize each token of ``text`` and return the lemmas as one string.

    ``text`` is split with ``nltk.word_tokenize``. Every token is passed to
    ``WordNetLemmatizer.lemmatize`` together with the POS code that
    ``find_pos`` reports for that token, and the lemmas are joined with
    single spaces.

    ``encoding`` is accepted but is not used by the body.
    """
    tokens = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(tok, find_pos(tok)) for tok in tokens)

In [5]:
from nltk.corpus import stopwords

def remove_stopwords(text, lang='english'):
    """Return ``text`` with the stopwords of ``lang`` removed.

    Args:
        text: The text to filter; it is split with ``nltk.word_tokenize``.
        lang: Name passed to ``stopwords.words`` to select the stopword list.

    Returns:
        The remaining tokens joined with single spaces. Tokens are compared
        case-insensitively but keep their original casing in the result.
    """
    words = nltk.word_tokenize(text)
    # A set makes each membership test constant-time instead of a list scan.
    lang_stopwords = set(stopwords.words(lang))
    return " ".join(w for w in words if w.lower() not in lang_stopwords)

# Text clean-up applied to each description.
def do_prepocessing(one_row):
    """Normalize one description: lowercase, strip noise, drop stopwords, lemmatize.

    Steps, in order:
      1. lowercase the text;
      2. blank out ``&word`` fragments (e.g. ``&amp``);
      3. blank out every character that is not a letter, digit, ``_``, ``.``,
         ``:`` or whitespace (punctuation, non-ASCII);
      4. collapse runs of whitespace to a single space;
      5. remove stopwords, then lemmatize.

    Args:
        one_row: The raw text. Non-string values (e.g. NaN for a missing
            description) are returned unchanged so the caller can drop them.

    Returns:
        The cleaned text, or ``one_row`` itself when it is not a string or
        when a processing step fails (a ``RuntimeWarning`` is emitted in the
        latter case).
    """
    import warnings

    # Explicit guard for the non-text case the old bare ``except`` was hiding.
    if not isinstance(one_row, str):
        return one_row

    try:
        lower_text = one_row.lower()

        # Must run before the character filter: that filter turns '&' into a
        # space, after which this pattern could never match anything.
        without_entities = re.sub(r'&[\w]+', ' ', lower_text)
        # Remove unwanted characters such as punctuation and non-ASCII.
        without_noise = re.sub(r'[^a-zA-Z0-9_.:\s]', ' ', without_entities)
        # Collapse extra whitespace.
        single_spaced = re.sub(r'\s+', ' ', without_noise)

        removed_stopwords_text = remove_stopwords(single_spaced)
        return words_lemmatizer(removed_stopwords_text)
    except Exception as exc:
        # Stay best-effort (return the input), but do not swallow
        # KeyboardInterrupt/SystemExit and do not fail silently.
        warnings.warn(
            f"do_prepocessing failed ({exc!r}); returning the text unprocessed",
            RuntimeWarning,
        )
        return one_row


# Clean every description. Despite its name, new_df is a Series (one cleaned
# string per row); values do_prepocessing cannot handle come back unchanged.
new_df = file['Description'].apply(do_prepocessing)
print(new_df.head())

0    datasets contains transaction make credit card...
1    ultimate soccer database data analysis machine...
2    background say success movie release certain c...
3    context information 170 000 terrorist attack g...
4    context bitcoin long run well know cryptocurre...
Name: Description, dtype: object


In [6]:
# Discard entries without a description, then confirm that none are left.
new_df.dropna(axis=0, inplace=True)
int(new_df.isnull().sum())

0

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build the tf-idf vocabulary and IDF weights from the cleaned descriptions.
# stop_words='english' makes the vectorizer drop English stop words as well.
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(new_df)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [8]:
from sklearn.decomposition import NMF,LatentDirichletAllocation
# Factorize the tf-idf matrix into 7 topics; random_state is fixed so reruns match.
nmf_model = NMF(n_components=7,random_state=42)
nmf_model.fit(tfidf.transform(new_df))


NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [9]:
# Fit a 7-topic LDA model on the same tf-idf matrix, with a fixed random_state.
# NOTE(review): LDA is usually fit on raw term counts (e.g. CountVectorizer)
# rather than tf-idf weights; confirm this input is intended. The recorded
# assignments for the first five documents all land on a single topic.
LDA = LatentDirichletAllocation(n_components=7,random_state=42)
LDA.fit(tfidf.transform(new_df))

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [10]:
# Vocabulary size. vocabulary_ maps each term to its column index, so its
# length is the feature count; unlike get_feature_names() (removed in
# scikit-learn 1.2) it exists in every scikit-learn version.
len(tfidf.vocabulary_)

31742

In [11]:
# Top words for each NMF topic.

# Fetch the vocabulary once instead of rebuilding the full list for every
# printed word. get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); fall back to the old name on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort is ascending, so the last 15 indices are the highest-weighted
    # terms, printed weakest to strongest.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['contains', 'service', 'airbnb', 'subset', 'house', 'extract', 'hotel', 'big', 'promptcloud', 'create', 'review', 'job', 'crawl', 'dataset', 'description']


THE TOP 15 WORDS FOR TOPIC #1
['contains', 'time', 'include', 'set', 'number', 'state', 'information', 'city', 'price', 'file', 'use', 'year', 'csv', 'dataset', 'data']


THE TOP 15 WORDS FOR TOPIC #2
['science', 'easy', 'thanks', 'past', 'community', 'citation', 'story', 'question', 'opportunity', 'acquire', 'attribution', 'answer', 'inside', 'data', 'owe']


THE TOP 15 WORDS FOR TOPIC #3
['imagenet', 'deep', 'convolutional', 'architecture', 'transferable', 'depth', 'residual', 'layer', 'network', 'learn', 'feature', 'image', 'pre', 'model', 'train']


THE TOP 15 WORDS FOR TOPIC #4
['sport', 'csv', 'win', 'stats', 'com', 'football', 'play', 'data', 'league', 'score', 'season', 'match', 'team', 'game', 'player']


THE TOP 15 WORDS FOR TOPIC #5
['id', 'article', 'file', 'user', 'sentiment', 'speech', 

In [12]:
# Top words for each LDA topic.

# Fetch the vocabulary once instead of rebuilding the full list for every
# printed word. get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); fall back to the old name on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort is ascending, so the last 15 indices are the highest-weighted
    # terms, printed weakest to strongest.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['simoiu', 'ramachandran', 'overgoor', 'corbett', 'anime', 'song', 'fivethirtyeight', 'officer', 'scorecard', 'reliably', 'police', 'stop', 'coco', 'jurisdiction', 'pesticide']


THE TOP 15 WORDS FOR TOPIC #1
['anime', 'powerball', 'wikihow', 'postal', 'en', 'le', 'minneapolis', 'nip', 'pollutant', 'zipcode', 'starbucks', 'epa', 'el', 'airbnb', 'openaddresses']


THE TOP 15 WORDS FOR TOPIC #2
['terror', 'crash', 'dry', 'voyant', 'pollster', 'marathon', 'slave', 'yelp', 'msa', 'gun', 'specie', 'shooting', 'tatoeba', 'pill', 'loan']


THE TOP 15 WORDS FOR TOPIC #3
['pharmacy', 'properati', 'liver', 'meteorite', 'homicide', 'song', 'murder', 'pump', 'victim', 'specie', 'ted', 'breach', 'uber', 'dataset', 'description']


THE TOP 15 WORDS FOR TOPIC #4
['plaque', 'toronto', 'nypd', 'podcast', 'glacier', 'densenet', 'marijuana', 'ufo', 'metacritic', 'squeezenet', 'bike', 'volcano', 'emoji', 'specie', 'thor']


THE TOP 15 WORDS FOR TOPIC #5
['image', 'information

In [13]:
# Vectorize the corpus once and reuse the matrix for both models.
doc_term_matrix = tfidf.transform(new_df)
# Each document gets the index of the topic with the largest weight in its row.
topic_results = nmf_model.transform(doc_term_matrix).argmax(axis=1)
topic_results_lda = LDA.transform(doc_term_matrix).argmax(axis=1)

In [14]:
# Cleaned descriptions, for comparison with the topic indices shown next.
new_df.head()

0    datasets contains transaction make credit card...
1    ultimate soccer database data analysis machine...
2    background say success movie release certain c...
3    context information 170 000 terrorist attack g...
4    context bitcoin long run well know cryptocurre...
Name: Description, dtype: object

In [15]:
# NMF topic index assigned to each of the first five documents.
topic_results[:5]

array([3, 4, 1, 1, 1], dtype=int64)

In [16]:
# LDA topic index assigned to each of the first five documents.
topic_results_lda[:5]

array([5, 5, 5, 5, 5], dtype=int64)

In [17]:
# Removing the most common words will help