In [28]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer 

In [4]:
# Get the absolute path to the data directory
abs_dir = os.path.abspath(os.path.dirname(os.getcwd()))

In [5]:
df = pd.read_csv(os.path.join(abs_dir, 'scripts/data/cbe_reviews_20250608_162756.csv'))

In [6]:
pd.set_option('display.max_colwidth', None)

df.head(5)

Unnamed: 0,review_text,rating,upvote,date,bank_name,source
0,really am happy to this app it is Siple to use everything,5,0,2025-06-07,Commercial Bank of Ethiopia,Google Play
1,I liked this app. But the User interface is very basic and not attractive at all.,2,0,2025-06-07,Commercial Bank of Ethiopia,Google Play
2,"""Why don’t your ATMs support account-to-account transfers like other countries( Kenya, Nigeria , South africa)""",4,0,2025-06-06,Commercial Bank of Ethiopia,Google Play
3,what is this app problem???,1,0,2025-06-05,Commercial Bank of Ethiopia,Google Play
4,the app is proactive and a good connections.,5,0,2025-06-05,Commercial Bank of Ethiopia,Google Play


In [7]:
# check types 
df.dtypes

review_text    object
rating          int64
upvote          int64
date           object
bank_name      object
source         object
dtype: object

In [8]:
df.describe()

Unnamed: 0,rating,upvote
count,4000.0,4000.0
mean,4.1105,8.48825
std,1.474562,82.261194
min,1.0,0.0
25%,4.0,0.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,3025.0


In [9]:
# check for null values in the DataFrame
df.isnull().sum()

review_text    0
rating         0
upvote         0
date           0
bank_name      0
source         0
dtype: int64

### Remove Duplicate Entries

When I check for duplicate entries in general, I find 81 duplicate rows. However, when I define duplicates based on both review_text and rating, the number increases to 1,162 rows. This indicates that the most influential factors in identifying duplicates are the review text and the rating.

In [10]:
# Check for duplicated data
dublicated_data = df.duplicated(subset=['review_text', 'rating'], keep=False).sum()
print(f'data duplicated: {dublicated_data}')
df[df.duplicated()]

data duplicated: 1162


Unnamed: 0,review_text,rating,upvote,date,bank_name,source
15,good,5,0,2025-06-04,Commercial Bank of Ethiopia,Google Play
74,good,5,0,2025-05-23,Commercial Bank of Ethiopia,Google Play
76,good,5,0,2025-05-23,Commercial Bank of Ethiopia,Google Play
86,ok,5,0,2025-05-22,Commercial Bank of Ethiopia,Google Play
103,best,5,0,2025-05-21,Commercial Bank of Ethiopia,Google Play
...,...,...,...,...,...,...
3381,Good,5,1,2024-02-11,Commercial Bank of Ethiopia,Google Play
3396,Good,5,1,2024-02-11,Commercial Bank of Ethiopia,Google Play
3409,Good,5,1,2024-02-09,Commercial Bank of Ethiopia,Google Play
3641,Good,5,1,2024-01-05,Commercial Bank of Ethiopia,Google Play


In [11]:
# remove duplicated data
df.drop_duplicates(subset=['review_text', 'rating'], keep='first', inplace=True)

In [12]:
# change date format from object to 'YYYY-MM-DD' format
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df.dtypes

review_text            object
rating                  int64
upvote                  int64
date           datetime64[ns]
bank_name              object
source                 object
dtype: object

In [13]:
df.shape

(2988, 6)

In [17]:
!pip install transformers #distilBERT-base-uncased, which helps to classify the reviews
!pip install huggingface_hub[hf_xet] # for downloading the model from Hugging Face, for the purpose of sentiment analysis


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\elsha\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#distilBERT-base-uncased,': Expected package name at the start of dependency specifier
    #distilBERT-base-uncased,
    ^


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\elsha\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^


In [16]:
from transformers import pipeline

# Load the sentiment analysis pipeline,  creates a pipeline for a specific NLP task. In this case, the task is sentiment analysis.
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english") 
#pipline is used to perform sentiment analysis on the review_text column

# Apply sentiment to review_text column
df['sentiment_result'] = df['review_text'].astype(str).apply(lambda x: sentiment_pipeline(x)[0]) 
#the above line applies the sentiment analysis pipeline to each review text and stores the result in a new column 'sentiment_result'
#astype(str) ensures that the review_text is treated as a string, even if it contains null values or other types.
#sentiment_pipeline(x)[0] returns the first result of the sentiment analysis, which is a dictionary containing the sentiment label and score.

df['sentiment_label'] = df['sentiment_result'].apply(lambda x: x['label'])
#the above line extracts the sentiment label (e.g., 'POSITIVE' or 'NEGATIVE') from the sentiment result and stores it in a new column 'sentiment_label'
#apply(lambda x: x['label']) is used to extract the label from each sentiment result.

df['sentiment_score'] = df['sentiment_result'].apply(lambda x: x['score'])
#the above line extracts the sentiment score (a numerical value indicating the confidence of the sentiment) from the sentiment result and stores it in a new column 'sentiment_score'

print(df[['review_text', 'sentiment_label', 'sentiment_score',"sentiment_result"]].head(5))


Device set to use cpu


                                                                                                       review_text  \
0                                                        really am happy to this app it is Siple to use everything   
1                                I liked this app. But the User interface is very basic and not attractive at all.   
2  "Why don’t your ATMs support account-to-account transfers like other countries( Kenya, Nigeria , South africa)"   
3                                                                                      what is this app problem???   
4                                                                     the app is proactive and a good connections.   

  sentiment_label  sentiment_score  \
0        POSITIVE         0.998870   
1        NEGATIVE         0.999684   
2        NEGATIVE         0.996465   
3        NEGATIVE         0.999623   
4        POSITIVE         0.999868   

                                     sentiment_result  
0  {'l

In [23]:
agg_mean = df.groupby(["bank_name", "rating"])["sentiment_score"].mean().reset_index()
#this line groups the DataFrame by 'bank_name' and 'rating', calculates the mean sentiment score for each group, and resets the index to create a new DataFrame with the results.
#reset_index() is used to convert the grouped DataFrame back into a regular DataFrame format.

print(arg_mean.head(5))

                     bank_name  rating  sentiment_score
0  Commercial Bank of Ethiopia       1         0.976435
1  Commercial Bank of Ethiopia       2         0.963838
2  Commercial Bank of Ethiopia       3         0.959347
3  Commercial Bank of Ethiopia       4         0.953669
4  Commercial Bank of Ethiopia       5         0.960054


In [27]:
# Rename the column for clarity (optional)
agg_mean.rename(columns={'sentiment_score': 'avg_sentiment_score'}, inplace=True)
#this line renames the 'sentiment_score' column to 'avg_sentiment_score' for clarity, indicating that it represents the average sentiment score for each group.

# agg_df = agg_mean.rename(columns={'sentiment_score': 'average_sentiment_score'}, inplace=True)
print(arg_mean.head(5))

                     bank_name  rating  sentiment_score
0  Commercial Bank of Ethiopia       1         0.976435
1  Commercial Bank of Ethiopia       2         0.963838
2  Commercial Bank of Ethiopia       3         0.959347
3  Commercial Bank of Ethiopia       4         0.953669
4  Commercial Bank of Ethiopia       5         0.960054


## Thematic Analysis
A theme refers to a recurring concept or topic within user reviews. For this challenge, themes will help summarize user feedback into actionable categories for the banks.

### Keyword Extraction & Manual/Rule-Based Clustering

In [35]:
verctorizer = TfidfVectorizer(max_features=100)

# Fit and transform the review_text column to create a TF-IDF matrix

tfidf_verctorized = verctorizer.fit_transform(df['review_text'].astype(str))

keywords = verctorizer.get_feature_names_out()

print(f"Number of keywords: {len(keywords)}")

print(f"Keywords: {keywords[:20]}")  # Display the first 10 keywords


Number of keywords: 100
Keywords: ['account' 'after' 'all' 'am' 'amazing' 'an' 'and' 'app' 'application'
 'apps' 'are' 'as' 'at' 'bank' 'banking' 'be' 'best' 'but' 'by' 'can']


In [54]:


bank_keywords = {}
# grouped_by_bank = df.groupby('bank_name')
# print(" grouped by bank name  ",grouped_by_bank) # this line counts the number of reviews for each bank and displays the result.



for bank, group in df.groupby('bank_name'): # grouping the DataFrame by 'bank_name', output is a dictionary-like object where each key is a bank name and each value is a DataFrame containing the reviews for that bank.

    vectorizer = TfidfVectorizer(max_features=100, stop_words='english')

    print("bank", bank)  # Display the bank name and its corresponding group of reviews
    #print( "group", group["review_text"].head(5))  # Display the first 5 reviews for the current bank

    tfidf_matrix = vectorizer.fit_transform(group['review_text'].astype(str)) # this line fits the TF-IDF vectorizer to the review_text of each bank and transforms it into a TF-IDF matrix.

    keywords = vectorizer.get_feature_names_out()
    # print("keywords for bank:", bank, "->", keywords)  # Display the keywords for each bank
    
    bank_keywords[bank] = keywords # Store the keywords for each bank in the dictionary,eg. CBE:{keyword1, keyword2, ...}

    print("bank_keywords:", bank_keywords)  # Display the dictionary containing keywords for each bank




bank Commercial Bank of Ethiopia
bank_keywords: {'Commercial Bank of Ethiopia': array(['access', 'account', 'add', 'amazing', 'app', 'application',
       'apps', 'available', 'bad', 'balance', 'bank', 'banking', 'banks',
       'best', 'better', 'birr', 'branch', 'cbe', 'commercial',
       'customer', 'days', 'developer', 'doesn', 'don', 'easy', 'error',
       'ethiopia', 'excellent', 'experience', 'fast', 'fix', 'friendly',
       'good', 'great', 'help', 'history', 'important', 'interesting',
       'issue', 'just', 'life', 'like', 'love', 'make', 'message',
       'mobile', 'mode', 'money', 'need', 'needs', 'network', 'new',
       'nice', 'ok', 'option', 'options', 'payment', 'phone', 'poor',
       'problem', 'properly', 'really', 'reason', 'receipt', 'recent',
       'reliable', 'says', 'screenshot', 'security', 'seen', 'send',
       'service', 'simple', 'statement', 'telebirr', 'thank', 'thanks',
       'thing', 'time', 'times', 'transaction', 'transactions',
       'transfe

## Manual Grouping 

In [56]:
theme_counts = themes = {
    'Account Access Issues': ['login', 'access', 'account', 'password', 'blocked', 'error'],
    'Transaction Performance': ['transaction', 'delay', 'payment', 'processing', 'funds', 'failed'],
    'User Interface & Experience': ['app', 'interface', 'design', 'navigation', 'crash', 'slow'],
    'Customer Support': ['support', 'agent', 'response', 'feedback', 'service'],
    'Feature Requests': ['feature', 'add', 'need', 'want', 'update', 'new']
}

bank_theme_mapping = {}

for bank, keywords in bank_keywords.items():
    theme_counts = {theme: 0 for theme in themes}
    
    for word in keywords:
        for theme, word_list in themes.items():
            if word in word_list:
                theme_counts[theme] += 1
    
    # Sort and keep top 3–5 themes
    sorted_themes = sorted(theme_counts.items(), key=lambda x: x[1], reverse=True) #x[1] is used to sort the themes based on their counts in descending order.
    bank_theme_mapping[bank] = [theme for theme, count in sorted_themes if count > 0][:5]

# Display
for bank, top_themes in bank_theme_mapping.items():
    print(f"{bank}: {top_themes}")

Commercial Bank of Ethiopia: ['Feature Requests', 'Account Access Issues', 'Transaction Performance', 'User Interface & Experience', 'Customer Support']



## 🚀 Alternative Approach: Use Embedding Similarity (with spaCy)
This approach:

Uses word embeddings to measure similarity between keywords and theme labels.

Dynamically assigns keywords to themes without hardcoding word lists.

Works well when keywords are varied or noisy.

pip install spacy
python -m spacy download en_core_web_md 

## 🧠 What is spaCy?

spaCy is a popular open-source Natural Language Processing (NLP) library in Python.

✅ It provides:
Tokenization (splitting sentences into words)

Part-of-speech tagging (e.g., noun, verb)

Named entity recognition (e.g., people, places)

Word vectors / embeddings (words as mathematical vectors for similarity)

Dependency parsing (grammar structure of a sentence)

🔍 Why is spaCy useful?
You can compare words by meaning using pre-trained word vectors.

It allows semantic similarity, like matching "login" with "access issue".

But spaCy is not required for everything — you can do topic grouping without it.

In [57]:
!pip install spacy #this library is used for natural language processing tasks, such as tokenization, part-of-speech tagging, and named entity recognition.

Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Downloading spacy-3.8.7-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.13-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp312-cp312-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.10-cp312-cp312-win_amd64.whl.metadata (2.5 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.6-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting s

  You can safely remove it manually.
  You can safely remove it manually.

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\elsha\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
python -m spacy download en_core_web_md # this line downloads the English language model for spaCy

In [None]:
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Load spaCy model with word vectors
nlp = spacy.load("en_core_web_md")

# Define theme labels (you can adjust these)
themes = ['Account Access Issues', 'Transaction Performance', 'User Interface & Experience', 'Customer Support', 'Feature Requests']

# Embed themes
theme_vectors = {theme: nlp(theme).vector for theme in themes}

bank_theme_mapping = {}

for bank, group in df.groupby('bank_name'):
    # TF-IDF vectorization for current bank
    vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(group['review_text'].astype(str))
    keywords = vectorizer.get_feature_names_out()
    
    # Assign each keyword to the most similar theme
    theme_count = {theme: 0 for theme in themes}
    
    for kw in keywords:
        kw_vec = nlp(kw).vector
        if np.linalg.norm(kw_vec) == 0:
            continue  # skip empty vectors (rare but possible)
        
        # Compute cosine similarity
        similarities = {theme: np.dot(kw_vec, vec) / (np.linalg.norm(vec) * np.linalg.norm(kw_vec)) for theme, vec in theme_vectors.items()}
        best_theme = max(similarities, key=similarities.get)
        theme_count[best_theme] += 1
    
    # Get top 3–5 themes
    sorted_themes = sorted(theme_count.items(), key=lambda x: x[1], reverse=True)
    bank_theme_mapping[bank] = [theme for theme, count in sorted_themes if count > 0][:5]

# Output themes per bank
for bank, top_themes in bank_theme_mapping.items():
    print(f"{bank}: {top_themes}")


## Pipeline

Script preprocessing (tokenization, stop-word removal, lemmatization if useful) with Pandas and NLP libraries.

In [63]:
#import dependencies
import pandas as pd
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize #for text preprocessing, which includes tokenization, stopword removal, and lemmatization
from nltk.stem import WordNetLemmatizer #for text preprocessing, which includes tokenization, stopword removal, and lemmatization
 
#sentiment analysis
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer #for sentiment analysis
from sklearn.feature_extraction.text import TfidfVectorizer #for text vectorization, means converting text data into numerical format that machine learning algorithms can understand, what TF-IDF does is that it takes into account both the frequency of words and their importance in the context of the document, which can help the model understand the meaning and relevance of words in the text.
from sklearn.model_selection import train_test_split #for splitting the dataset into training and testing sets
from sklearn.naive_bayes import MultinomialNB #for building a Naive Bayes classifier, which is a simple and effective algorithm for text classification tasks
from sklearn.metrics import accuracy_score

#keyword Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

#visuals
import matplotlib.pyplot as plt
from wordcloud import WordCloud #for visualizing text data

In [71]:
# Download NLTK resources
nltk.download('punkt') # for tokenization, means splitting text into words
nltk.download('stopwords') # for removing common words like 'the', 'is', etc.
nltk.download('averaged_perceptron_tagger') # for part-of-speech tagging, meaning identifying the grammatical category of words
nltk.download('vader_lexicon') # for sentiment analysis using VADER
nltk.download('wordnet') # for lemmatization, which is reducing words to their base form, eg. 'running' to 'run'
nltk.download('omw-1.4') # for lemmatization with WordNet, which is a lexical database for the English language
nltk.download('punkt') # for tokenization, which is splitting text into sentences or words
nltk.download('punkt_tab') # for tokenization with tab-separated values, which is useful for processing text data in a tabular format

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\elsha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elsha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\elsha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\elsha\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\elsha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\elsha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloadin

True

In [73]:
# def preprocess_text(text):
#     # Tokenization
#     tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
#     # Remove stopwords
#     lematize = WordNetLemmatizer()

#     stop_words = set(stopwords.words('english'))

#     tokens = [lematize.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]

#     return ' '.join(tokens)  # Join tokens back into a single string


# df["processed_review_text"] = df["review_text"].apply(preprocess_text)

# Preprocessing function

# This function takes a text input, tokenizes it, removes stopwords, and lemmatizes the words.

def preprocess_text(text):
    tokens = word_tokenize(text.lower()) # Tokenize and convert to lowercase
    lemmatizer = WordNetLemmatizer() # Initialize lemmatizer, which reduces words to their base form
    stop_words = set(stopwords.words('english')) # Set of common words to remove, like 'the', 'is', etc.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words] # Remove non-alphanumeric tokens and stopwords
    return ' '.join(tokens) # Join tokens back into a single string

# Apply preprocessing       
df['processed_review'] = df['review_text'].apply(preprocess_text) # Display the first few rows of the processed reviews

print(df['processed_review'].head(5)) # Display the first few rows of the processed reviews


0                           really happy app siple use everything
1                       liked app user interface basic attractive
2    atm support transfer like country kenya nigeria south africa
3                                                     app problem
4                                   app proactive good connection
Name: processed_review, dtype: object


In [78]:
# Create a new column named identified_theme
df['identified_theme'] = df['bank_name'].apply(lambda x: ', '.join(bank_theme_mapping[x])) # Apply the lambda function to create the identified_theme column,
#bank_theme_mapping[x]): this retrieves the top themes for each bank from the bank_theme_mapping dictionary and joins them into a single string separated by commas.
#bank theme mapping is a dictionary where each key is a bank name and each value is a list of identified themes for that bank.


# Define the columns you want to save
columns_to_save = ['review_text', 'sentiment_label', 'sentiment_score', 'identified_theme']

# Save the results to a CSV file
df[columns_to_save].to_csv('results.csv', index=False)

df['identified_theme'].head(5)  # Display the first few rows of the identified themes

0    Feature Requests, Account Access Issues, Transaction Performance, User Interface & Experience, Customer Support
1    Feature Requests, Account Access Issues, Transaction Performance, User Interface & Experience, Customer Support
2    Feature Requests, Account Access Issues, Transaction Performance, User Interface & Experience, Customer Support
3    Feature Requests, Account Access Issues, Transaction Performance, User Interface & Experience, Customer Support
4    Feature Requests, Account Access Issues, Transaction Performance, User Interface & Experience, Customer Support
Name: identified_theme, dtype: object

In [91]:
# Vectorize the dataset

df_results = pd.read_csv('results.csv')  # Read the results CSV file

vectorizer = TfidfVectorizer(max_features=100)
X = vectorizer.fit_transform(df_results['review_text'].astype(str))  # Convert review_text to string and fit the vectorizer

# Get top keywords
keywords = vectorizer.get_feature_names_out()
print("Top Keywords:", keywords)




Top Keywords: ['account' 'after' 'all' 'am' 'amazing' 'an' 'and' 'app' 'application'
 'apps' 'are' 'as' 'at' 'bank' 'banking' 'be' 'best' 'but' 'by' 'can'
 'cbe' 'developer' 'do' 'doesn' 'don' 'easy' 'ethiopia' 'even' 'ever'
 'every' 'fast' 'fix' 'for' 'from' 'get' 'good' 'great' 'has' 'have' 'if'
 'in' 'is' 'it' 'its' 'like' 'make' 'me' 'mobile' 'money' 'more' 'my'
 'need' 'new' 'nice' 'no' 'not' 'of' 'on' 'one' 'only' 'option' 'or'
 'other' 'please' 'problem' 'really' 'service' 'simple' 'so' 'some'
 'sometimes' 'system' 'thank' 'thanks' 'that' 'the' 'there' 'this' 'time'
 'to' 'transaction' 'transactions' 'transfer' 'up' 'update' 'use' 'using'
 'very' 'was' 'we' 'what' 'when' 'why' 'with' 'work' 'working' 'wow' 'you'
 'your' 'ነው']


## Cluster into 3–5 themes per bank (e.g., UI, reliability).

In [94]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer