In [1]:
# Import required modules
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import pandas as pd
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def preprocess_text(text):
    """Strip dates, clock times and standalone numbers from ``text``.

    Date and time patterns are removed *before* bare numbers. Removing the
    numbers first would consume the digit groups of every date/time and leave
    only their separators behind (``2023-08-18`` -> ``--``), so the more
    specific patterns could never match.

    Args:
        text (str): Raw document text.

    Returns:
        str: The text without the matched tokens, with every run of
        whitespace collapsed to a single space and no leading or trailing
        whitespace.
    """
    # Dates, e.g. 2023-08-18, 08/18/2023, 08-18-2023
    text = re.sub(r'\b\d{4}-\d{2}-\d{2}\b', '', text)
    text = re.sub(r'\b\d{2}/\d{2}/\d{4}\b', '', text)
    text = re.sub(r'\b\d{2}-\d{2}-\d{4}\b', '', text)
    # Times: the fractional-seconds form (14:30:00.123) goes first, otherwise
    # the shorter HH:MM[:SS] pattern would match its prefix and leave ".123".
    text = re.sub(r'\b\d{2}:\d{2}:\d{2}\.\d{1,3}\b', '', text)
    text = re.sub(r'\b\d{2}:\d{2}(:\d{2})?\b', '', text)
    # Whatever standalone numbers are left
    text = re.sub(r'\b\d+\b', '', text)
    # Collapse the gaps left behind by the removals
    return ' '.join(text.split())

In [3]:

# Load the raw content file; header=None means the first line is data, not column names.
content_path = '../data/content_matching.csv'
df = pd.read_csv(content_path, header=None)



In [4]:
# Show the column labels pandas assigned so the text column can be identified.
print(df.columns)
# Rename by label instead of overwriting `df.columns`: assigning a one-element
# list raises a length-mismatch error when this cell is re-run after later
# cells have added columns to `df`, whereas `rename` is then simply a no-op.
df = df.rename(columns={0: 'text_column'})


Index([0], dtype='int64')


In [5]:
# Run the cleaning function over every document (cast to str first) and keep
# the result in a new column next to the original text.
raw_documents = df['text_column'].astype(str)
df['cleaned_text'] = raw_documents.apply(preprocess_text)
print(df['cleaned_text'])


0     : When there is a sexual interaction, or there...
1     Deja Blu & Andre Duqum Interview Sadhguru in t...
2     : how do we get out of this i'm not enough dra...
3     : this is why i said that simple process every...
4     I am advising everyone of you, keep a lamp in ...
                            ...                        
80    Sadhguru tells the difference between integrit...
81    he tells we need not to create a department in...
82    how to love themselves - To love or hate you n...
83    I find that there's people that don't have goo...
84    Do you think that emotional relationship will ...
Name: cleaned_text, Length: 85, dtype: object


In [6]:
# Save the current frame to CSV, leaving out the row index.
cleaned_output_path = '../data/cleaned_content.csv'
df.to_csv(cleaned_output_path, index=False)

In [7]:
# ngram_range=(1, 2) extracts both unigrams and bigrams, so a unigram can also
# show up as part of a bigram; remove_substring_keys() filters that overlap
# when the keywords are reported.
stop_words_list = [*ENGLISH_STOP_WORDS]
tfidf = TfidfVectorizer(
    stop_words=stop_words_list,
    ngram_range=(1, 2),
)

# Learn the vocabulary from the cleaned text and score every document.
cleaned_documents = df['cleaned_text']
tfidf_matrix = tfidf.fit_transform(cleaned_documents)

# Feature (n-gram) labels for the columns of tfidf_matrix.
feature_names = tfidf.get_feature_names_out()

In [8]:
# Convert the TF-IDF matrix to a dense array and label its columns with the n-grams.
dense_scores = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_scores, columns=feature_names)

In [9]:
# Save the TF-IDF score table to CSV, leaving out the row index.
feature_output_path = '../data/feature_content.csv'
tfidf_df.to_csv(feature_output_path, index=False)

In [10]:



def get_significant_ngrams(row, top_n=10):
    """Return the highest-scoring n-grams of one document.

    Args:
        row: Object with a ``to_dict()`` method that yields
            ``{ngram: score}`` (the notebook passes a row of ``tfidf_df``).
        top_n (int): Maximum number of pairs to return.

    Returns:
        list[tuple]: ``(ngram, score)`` pairs in descending score order, at
        most ``top_n`` long. N-grams whose score is not positive are left
        out, so a document with fewer than ``top_n`` scored terms yields a
        shorter list instead of being padded with zero-score vocabulary.
    """
    # A zero score means the n-gram contributes nothing to this document, so
    # it must not be reported as "significant".
    scored = [(ngram, score) for ngram, score in row.to_dict().items() if score > 0]
    # list.sort() is stable, so equal scores keep their original order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_n]


In [11]:
def get_significant_words(row, top_n=10):
    """Return up to ``top_n`` ``(term, score)`` pairs, best score first.

    Args:
        row: Object with a ``to_dict()`` method that yields
            ``{term: score}``.
        top_n (int): Maximum number of pairs to return.

    Returns:
        list[tuple]: ``(term, score)`` pairs sorted by descending score.
        Terms with a score of zero (or less) are excluded, so the list can be
        shorter than ``top_n`` when the document has few scored terms.
    """
    # Keep only terms that actually score for this document; otherwise short
    # documents get their result filled up with zero-score terms.
    positive = {term: score for term, score in row.to_dict().items() if score > 0}
    # Sort by score, highest first, and cut to the requested length.
    ranked = sorted(positive.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

In [12]:
# Vocabulary of the fitted vectorizer, fetched again here (the same call is
# made right after fitting); the unigram/bigram split below iterates over it.
feature_names = tfidf.get_feature_names_out()



In [13]:
# Not in use, just trying out: in the notebook as shown, `unique_terms` is only
# referenced from disabled code.
unigrams = set()
bigrams = set()

# Split the vocabulary into one-word and two-word features.
for feature in feature_names:
    word_count = len(feature.split())
    if word_count <= 1:
        unigrams.add(feature)
    elif word_count == 2:
        bigrams.add(feature)

# Every word that occurs in at least one bigram. Comparing whole words avoids
# treating a unigram as covered just because its letters occur inside a longer
# word of some bigram, and replaces the unigram-by-bigram nested scan with
# set lookups.
bigram_words = {word for bigram in bigrams for word in bigram.split()}

# All bigrams, plus the unigrams that are not a word of any bigram.
unique_terms = bigrams | (unigrams - bigram_words)


In [14]:
# Function to drop keys that are duplicated inside a longer key (e.g. a unigram contained in a bigram)
def remove_substring_keys(input_dict):
    """Drop keys whose words already occur, in order, inside a longer kept key.

    Keys are visited from longest to shortest. A key is kept unless its whole
    space-separated word sequence appears inside a key that was kept earlier,
    e.g. ``"lamp"`` is dropped when ``"burning lamp"`` is present.

    The comparison is made on whole words (keys are assumed to separate words
    with single spaces). A raw character-level ``in`` test would also discard
    unrelated keywords that merely share letters, such as ``"man"`` because of
    ``"human nature"``.

    Args:
        input_dict (dict): Maps string keys (n-grams) to arbitrary values.

    Returns:
        dict: New dictionary with the surviving keys and their original
        values, inserted longest key first. ``input_dict`` is not modified.
    """
    kept = {}
    padded_kept = []  # kept keys wrapped in spaces, for whole-word matching
    for key in sorted(input_dict, key=len, reverse=True):
        needle = f" {key} "
        if any(needle in haystack for haystack in padded_kept):
            continue  # already covered by a longer key
        kept[key] = input_dict[key]
        padded_kept.append(needle)
    return kept





In [15]:
# Disabled experiment kept as a bare string literal: nothing inside it runs,
# and the cell only echoes the string back as its output.
'''
for i, row in tfidf_df.iterrows():
    significant_ngrams = get_significant_ngrams(row,6)
    print(f"Document {i}:")
    for ngram, score in significant_ngrams:
        if ngram in unique_terms:
            print(f"  {ngram}: {score:.4f}")
    print("\n")
'''

'\nfor i, row in tfidf_df.iterrows():\n    significant_ngrams = get_significant_ngrams(row,6)\n    print(f"Document {i}:")\n    for ngram, score in significant_ngrams:\n        if ngram in unique_terms:\n            print(f"  {ngram}: {score:.4f}")\n    print("\n")\n'

In [16]:
# For every document: take its top-scoring n-grams, drop the ones already
# covered by a longer n-gram, print them and store them next to the source text.
for i, row in tfidf_df.iterrows():
    significant_ngrams = get_significant_ngrams(row, 8)
    print(f"Row {i+1}:")
    # Scores are kept as 4-decimal strings, the form that is printed and saved.
    ngram_dict = {ngram: f"{score:.4f}" for ngram, score in significant_ngrams}
    # Remove substring keys
    unique_ngram_dict = remove_substring_keys(ngram_dict)
    # Rank on the numeric value rather than on the formatted text, so the
    # order does not depend on how the strings happen to compare.
    sorted_unique_ngram_dict = dict(
        sorted(unique_ngram_dict.items(), key=lambda item: float(item[1]), reverse=True)
    )

    for key, value in sorted_unique_ngram_dict.items():
        print(f"  {key}: {value}")
    df.at[i, "Keywords and Scores"] = str(sorted_unique_ngram_dict)
    print("\n")

# Write the results once, after every row has been filled in; saving inside
# the loop rewrote the whole file on each iteration.
df.to_csv("../data/result_keywords.csv")

Row 1:
  simple possible: 0.1966
  body memory: 0.1966
  sexual: 0.1966
  question: 0.1799


Row 2:
  perception: 0.1905
  digestive process: 0.1573
  food remain: 0.1573
  half hours: 0.1573
  liquid: 0.1573
  bag: 0.1573


Row 3:
  thing happened: 0.2609
  human: 0.2375
  desire expand: 0.1957
  expand limitlessly: 0.1305


Row 4:
  engineering: 0.4300
  stuff: 0.1967
  trust: 0.1837
  saying: 0.1584
  angel comes: 0.1433


Row 5:
  lamp: 0.4174
  burning: 0.2087
  light: 0.1909
  home: 0.1606
  don want: 0.1538
  act particular: 0.1044
  advising: 0.1044


Row 6:
  everybody hurt: 0.3400
  hurt everybody: 0.1700
  horse knight: 0.1700
  hurt perfect: 0.1700
  human hurt: 0.1700


Row 7:
  somebody make: 0.3754
  angry somebody: 0.1368
  freak somebody: 0.1368
  decide happen: 0.1368
  choice way: 0.1368


Row 8:
  responsibility: 0.2210
  doesn matter: 0.1628
  work: 0.1628
  match idiots: 0.1473
  matched: 0.1473
  stars: 0.1473


Row 9:
  thing ve: 0.2899
  ve seen: 0.2899
  infra

In [54]:
# Execute this to see the raw results, where unigrams that also occur inside
# bigrams are not filtered out. The code is disabled: it is held in a string
# literal, so running the cell only echoes the string.
'''
for i, row in tfidf_df.iterrows():
    significant_ngrams = get_significant_ngrams(row,3)
    print(f"Document {i}:")
    #ngram_dict=dict()
    for ngram, score in significant_ngrams:
        print(f"  {ngram}: {score:.4f}")
    print("\n")
'''

'\nfor i, row in tfidf_df.iterrows():\n    significant_ngrams = get_significant_ngrams(row,3)\n    print(f"Document {i}:")\n    #ngram_dict=dict()\n    for ngram, score in significant_ngrams:\n        print(f"  {ngram}: {score:.4f}")\n    print("\n")\n'