In [8]:
import pandas as pd

# read json into a dataframe
df_idf=pd.read_json("stackoverflow-data-idf.json",lines=True)

# print schema
print("Schema:\n\n",df_idf.dtypes)
print("Number of questions,columns=",df_idf.shape)

Schema:

 id                            int64
title                        object
body                         object
answer_count                  int64
comment_count                 int64
creation_date                object
last_activity_date           object
last_editor_display_name     object
owner_display_name           object
owner_user_id               float64
post_type_id                  int64
score                         int64
tags                         object
view_count                    int64
accepted_answer_id          float64
favorite_count              float64
last_edit_date               object
last_editor_user_id         float64
community_owned_date         object
dtype: object
Number of questions,columns= (20502, 19)


In [16]:
df_idf.head(3)

Unnamed: 0,id,title,body,answer_count,comment_count,creation_date,last_activity_date,last_editor_display_name,owner_display_name,owner_user_id,post_type_id,score,tags,view_count,accepted_answer_id,favorite_count,last_edit_date,last_editor_user_id,community_owned_date
0,4821394,Serializing a private struct - Can it be done?,<p>I have a public class that contains a priva...,1,0,2011-01-27 20:19:13.563 UTC,2011-01-27 20:21:37.59 UTC,,,163534.0,1,0,c#|serialization|xml-serialization,296,,,,,
1,3367882,How do I prevent floated-right content from ov...,<p>I have the following HTML:</p>\n\n<pre><cod...,2,2,2010-07-30 00:01:50.9 UTC,2012-05-10 14:16:05.143 UTC,,,1190.0,1,2,css|overflow|css-float|crop,4121,3367943.0,0.0,2012-05-10 14:16:05.143 UTC,44390.0,
2,31682135,Gradle command line,<p>I'm trying to run a shell script with gradl...,0,2,2015-07-28 16:30:18.28 UTC,2015-07-28 16:32:15.117 UTC,,,1299158.0,1,1,bash|shell|android-studio|gradle,259,,,,,


We see that there are other metadata that are not relevant for this analysis. We will only be using ```body``` and ```title``` for this project



In [46]:
import re

def pre_process(text):
  # Convert the entire text corpus to lower case
  text = text.lower()

  #Removing tags
  text = re.sub("</?.*?>", " <> ", text)

  # Remove special characters and digits
  text = re.sub("(\\d|\\W)+", " ", text)

  return text

df_idf['text'] = df_idf['title'] + df_idf['body']
df_idf['text'] = df_idf['text'].apply(lambda x: pre_process(x))

In [47]:
# Show the first text
df_idf['text'][0]

'serializing a private struct can it be done i have a public class that contains a private struct the struct contains properties mostly string that i want to serialize when i attempt to serialize the struct and stream it to disk using xmlserializer i get an error saying only public types can be serialized i don t need and don t want this struct to be public is there a way i can serialize it and keep it private '

Use a CountVectorizer to create vocabulary from all the text in ```df_idf['text']```

A CountVectorizer transforms a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text

In [48]:
from sklearn.feature_extraction.text import CountVectorizer

def get_stop_words(stop_file_path):
    """load stop words """

    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

# Load a set of stop words
# Sourced from https://github.com/kavgan/nlp-in-practice/blob/master/tf-idf/Keyword%20Extraction%20with%20TF-IDF%20and%20SKlearn.ipynb
stopwords = get_stop_words("stopwords.txt")
stopwords = list(stopwords)

In [49]:
# Get the text column
docs = df_idf['text'].tolist()

# Create a vocabulary of words,
# Ignore words that appear in 85% of documents,
# Eliminate stop words
cv = CountVectorizer(max_df=0.85,stop_words=stopwords)
word_count_vector=cv.fit_transform(docs)

Identifying the shape of the CountVectorizer

In [50]:
word_count_vector.shape

(20502, 124926)

20502 documents in our dataset (the rows)

Vocabulary size is 149391 meaning we have 149391 unique words (the columns) in our dataset minus the stopwords

We should use the min_df parameter to limit the size of the vocabulary because rare words don't help the model and they often cause overfitting

In [51]:
cv = CountVectorizer(min_df = 0.01, max_df = 0.95, stop_words = stopwords, max_features = 10000) # Limit our vocabulary size to 10000
word_count_vector = cv.fit_transform(docs)
print(word_count_vector.shape)
list(cv.vocabulary_.keys())[:10]

(20502, 879)


['private',
 'struct',
 'public',
 'class',
 'contains',
 'properties',
 'string',
 'attempt',
 'stream',
 'using']

#TfidfTransformer to Compute Inverse Document Frequency (IDF)


Taking the sparse matrix from CountVectorizer to generate the IDF when you invoke fit

TF-IDF Vectorizer is a measure of originality of a word by comparing the number of times a word appears in document with the number of documents the word appears in

In [52]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [53]:
tfidf_transformer.idf_ # Examining the IDF values

array([3.5568372 , 3.81031029, 5.2828796 , 5.19822671, 4.69784505,
       4.25124303, 5.1600055 , 4.91705932, 4.99873735, 4.56013931,
       3.04525714, 4.13598207, 4.43760296, 5.32252443, 4.56702402,
       5.26536602, 4.03873519, 5.44352956, 4.65356447, 4.8483933 ,
       5.29353689, 4.7755938 , 2.10910464, 5.23123301, 3.33193419,
       3.82336105, 5.10232639, 4.43004435, 5.38706295, 5.5956077 ,
       4.94944073, 3.91351214, 3.17298768, 4.98552712, 5.07039334,
       4.79492845, 3.14219006, 5.58121897, 5.03117263, 5.34859667,
       4.04794241, 4.81243437, 4.86454129, 5.21459369, 5.41893816,
       3.58813966, 5.52115472, 5.47300538, 4.78199724, 5.49898087,
       5.49024719, 5.37925041, 5.43526505, 4.87154248, 5.49024719,
       5.56235048, 4.60576126, 4.99079029, 4.70575023, 5.23796704,
       4.1866258 , 5.46449469, 4.71772642, 4.74211787, 4.01957172,
       4.92938993, 4.67449768, 5.27935226, 5.30070538, 3.37713963,
       5.03117263, 5.2828796 , 5.56235048, 4.46529704, 5.48158

# Computing TF-IDF and Extracting Keywords
Using questions in ```stackoverflow-test.json```


*   Compute TF-IDF
*   Extract the top keywords


In [54]:
df_test = pd.read_json("stackoverflow-test.json",lines=True)

df_test.head(3)

Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,creation_date,last_activity_date,last_edit_date,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,post_type_id,score,tags,view_count,favorite_count
0,3247246,Integrate War-Plugin for m2eclipse into Eclips...,<p>I set up a small web project with JSF and M...,3247526.0,2,0,2010-07-14 14:39:48.053 UTC,2010-07-14 16:02:19.683 UTC,2010-07-14 15:56:37.803 UTC,,70604.0,,389430.0,1,2,eclipse|maven-2|tomcat|m2eclipse,1653,
1,40270764,phantomjs-node page.evaulate seems to hang,<p>I have an implementation of 'waitfor' with ...,,1,0,2016-10-26 19:35:00.537 UTC,2016-11-02 20:05:09.143 UTC,,,,,245076.0,1,0,node.js|phantomjs,35,
2,27532383,Dynamic operations can only be performed in ho...,<p>I'm working with an API that requires:</p>\...,,1,0,2014-12-17 18:31:18.6 UTC,2014-12-17 19:57:43.443 UTC,,,,,3105880.0,1,1,c#|asp.net-mvc,4372,


In [58]:
# Concatenate title and body

df_test['text'] = df_test['title'] + df_test['body']
df_test['text'] = df_test['text'].apply(lambda x: pre_process(x))

# Get test docs into a list
docs_test=df_test['text'].tolist()
docs_title=df_test['title'].tolist()
docs_body=df_test['body'].tolist()

In [59]:
# Sorts the values in the vector while preserving the column index
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

# Get the feature names and tf-idf score of top n items
def extract_topn_from_vector(feature_names, sorted_items, topn=10):

    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]

        # Keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    # Create a tuples of feature,score
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]

    return results

Computing the TF-IDF value for a given document in our test set

*   Using tfidf_transformer.transform(...) to generate a vector of TF-IDF scores
*   Sort the words in the vector in descending order of TF-IDF values and then iterate over to extract the top-n items with the corresponding feature names

In [60]:
feature_names = cv.get_feature_names_out()

# get the document that we want to extract keywords from
doc=docs_test[0]

#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

#sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())

#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,10)

# now print the results
print("\n=====Title=====")
print(docs_title[0])
print("\n=====Body=====")
print(docs_body[0])
print("\n===Keywords===")
for k in keywords:
    print(k,keywords[k])


=====Title=====
Integrate War-Plugin for m2eclipse into Eclipse Project

=====Body=====
<p>I set up a small web project with JSF and Maven. Now I want to deploy on a Tomcat server. Is there a possibility to automate that like a button in Eclipse that automatically deploys the project to Tomcat?</p>

<p>I read about a the <a href="http://maven.apache.org/plugins/maven-war-plugin/" rel="nofollow noreferrer">Maven War Plugin</a> but I couldn't find a tutorial how to integrate that into my process (eclipse/m2eclipse).</p>

<p>Can you link me to help or try to explain it. Thanks.</p>

===Keywords===
eclipse 0.789
project 0.318
plugin 0.286
tutorial 0.158
couldn 0.153
explain 0.147
automatically 0.147
process 0.122
link 0.116
web 0.111


In [62]:
# Put the common code into several methods
def get_keywords(idx):

    # Generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([docs_test[idx]]))

    # Sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    # Extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)

    return keywords

def print_results(idx,keywords):
    # Now print the results
    print("\n=====Title=====")
    print(docs_title[idx])
    print("\n=====Body=====")
    print(docs_body[idx])
    print("\n===Keywords===")
    for k in keywords:
        print(k,keywords[k])

In [63]:
idx=120
keywords=get_keywords(idx)
print_results(idx,keywords)


=====Title=====
SQL Import Wizard - Error

=====Body=====
<p>I have a CSV file that I'm trying to import into SQL Management Server Studio.</p>

<p>In Excel, the column giving me trouble looks like this:
<a href="https://i.stack.imgur.com/pm0uS.png" rel="nofollow noreferrer"><img src="https://i.stack.imgur.com/pm0uS.png" alt="enter image description here"></a></p>

<p>Tasks > import data > Flat Source File > select file</p>

<p><a href="https://i.stack.imgur.com/G4b6I.png" rel="nofollow noreferrer"><img src="https://i.stack.imgur.com/G4b6I.png" alt="enter image description here"></a></p>

<p>I set the data type for this column to DT_NUMERIC, adjust the DataScale to 2 in order to get 2 decimal places, but when I click over to Preview, I see that it's clearly not recognizing the numbers appropriately:</p>

<p><a href="https://i.stack.imgur.com/NZhiQ.png" rel="nofollow noreferrer"><img src="https://i.stack.imgur.com/NZhiQ.png" alt="enter image description here"></a></p>

<p>The column ma

# Generate keywords for a batch of documents

In [64]:
# Generate tf-idf for all documents in your list. docs_test has 500 documents

tf_idf_vector=tfidf_transformer.transform(cv.transform(docs_test))

results=[]
for i in range(tf_idf_vector.shape[0]):

    # Get vector for a single document
    curr_vector=tf_idf_vector[i]

    # Sort the tf-idf vector by descending order of scores
    sorted_items=sort_coo(curr_vector.tocoo())

    # Extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)


    results.append(keywords)

df=pd.DataFrame(zip(docs,results),columns=['doc','keywords'])
df

Unnamed: 0,doc,keywords
0,serializing a private struct can it be done i ...,"{'eclipse': 0.789, 'project': 0.318, 'plugin':..."
1,how do i prevent floated right content from ov...,"{'content': 0.516, 'console': 0.369, 'log': 0...."
2,gradle command line i m trying to run a shell ...,"{'dynamic': 0.505, 'vs': 0.504, 'net': 0.247, ..."
3,loop variable as parameter in asynchronous fun...,"{'image': 0.442, 'jpg': 0.43, 'background': 0...."
4,canot get the href value hi i need to valid th...,"{'intent': 0.55, 'share': 0.492, 'file': 0.291..."
...,...,...
495,missing routes in rails after using resource k...,"{'format': 0.43, 'net': 0.391, 'string': 0.29,..."
496,recenter cursor in middle of screen when editi...,"{'node': 0.616, 'null': 0.328, 'response': 0.3..."
497,openssl verify peer client certificate in c i ...,"{'step': 0.552, 'idea': 0.292, 'start': 0.274,..."
498,more than one instances of block for different...,"{'length': 0.663, 'list': 0.444, 'variable': 0..."
