In [158]:
import os
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [159]:
# Load Dataset
documents_list = []
with open( os.path.join("articles+4.txt") ,"r",  encoding='utf-8') as fin:
    for line in fin.readlines():
        try:
            text = line.strip()
            documents_list.append(text)
        except:
            pass


In [160]:
documents_list

['Смартфон (англ. smartphone — умный телефон) — мобильный телефон (современный — как правило, с сенсорным экраном), дополненный функциональностью умного устройства. Также коммуникатор (англ. communicator) — изначально карманный персональный компьютер, дополненный функциональностью мобильного телефона. В настоящее время границу между терминами «смартфон» и «коммуникатор» можно считать полностью стёртой. Хотя в мобильных телефонах практически всегда были дополнительные функции (калькулятор, календарь), со временем выпускались всё более и более интеллектуальные модели. С целью подчеркнуть возросшую функциональность и вычислительную мощность таких моделей был введён термин «смартфон». В эру роста популярности смартфонов они стали выпускаться с функциями мобильного телефона, такие устройства были названы коммуникаторами. В настоящее время разделение на смартфоны и коммуникаторы уже неактуально, сейчас оба термина обозначают одно и то же — миниатюрный универсальный компьютер с полноценными п

In [161]:
# Initialize regex tokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize document using TF-IDF
tfidf = TfidfVectorizer(lowercase=True,
 #                       stop_words='russian',
                        ngram_range = (1,2),
                        tokenizer = tokenizer.tokenize)

# Fit and Transform the documents
train_data = tfidf.fit_transform(documents_list)   


In [185]:
train_data.toarray()
Df1 = pd.DataFrame(train_data.toarray())
Df1 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,790,791,792,793,794,795,796,797,798,799
0,0.075703,0.037851,0.037851,0.037851,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.05988,0.05988,0.05988,0.05988,0.05988,0.05988,...,0.0,0.0,0.05988,0.05988,0.0,0.0,0.05988,0.05988,0.05988,0.05988
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0753,0.0753,0.0,0.0,0.0753,0.0753,0.0,0.0,0.0,0.0


In [192]:
# Define the number of topics or components
num_components=2

# Create SVD object
lsa = TruncatedSVD(n_components=num_components)

# Fit SVD model on data
lsa.fit_transform(train_data)


# Get Singular values and Components 
Sigma = lsa.singular_values_ 
V_transpose = lsa.components_.T

In [193]:
lsa.components_
Df1 = pd.DataFrame(lsa.components_)
Df1 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,790,791,792,793,794,795,796,797,798,799
0,0.038819,0.019409,0.019409,0.019409,0.03086,0.03086,0.03086,0.03086,0.03086,0.03086,...,0.041419,0.041419,0.03086,0.03086,0.041419,0.041419,0.03086,0.03086,0.03086,0.03086
1,0.056974,0.028487,0.028487,0.028487,-0.043523,-0.043523,-0.043523,-0.043523,-0.043523,-0.043523,...,-0.001551,-0.001551,-0.043523,-0.043523,-0.001551,-0.001551,-0.043523,-0.043523,-0.043523,-0.043523


In [186]:
pd.DataFrame(V_transpose).to_excel("result_1.xlsx")

In [191]:
# Print the topics with their terms
terms = tfidf.get_feature_names_out()

for index, component in enumerate(lsa.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:8]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list, top_terms_key)

Topic 0:  ['и', 'в', 'из', 'с', 'их', 'математика', 'свойств', 'мудрости'] [('и', 0.39993330435175145), ('в', 0.2265386983021158), ('из', 0.12425688450699683), ('с', 0.11645561334494067), ('их', 0.10191057193195178), ('математика', 0.09258119368138992), ('свойств', 0.09258119368138992), ('мудрости', 0.08283792300466458)]
Topic 1:  ['с', 'время', 'коммуникатор', 'мобильного', 'мобильного телефона', 'мобильных', 'смартфон', 'телефона'] [('с', 0.17092309012094464), ('время', 0.08546154506047232), ('коммуникатор', 0.08546154506047232), ('мобильного', 0.08546154506047232), ('мобильного телефона', 0.08546154506047232), ('мобильных', 0.08546154506047232), ('смартфон', 0.08546154506047232), ('телефона', 0.08546154506047232)]
Topic 2:  ['из', 'мудрости', 'обществом', 'философия', 'и', 'абстрактной', 'абстрактной из', 'буквально'] [('из', 0.1915821705394241), ('мудрости', 0.12772144702628277), ('обществом', 0.12772144702628277), ('философия', 0.12772144702628277), ('и', 0.06389509850599374), ('а

In [166]:
body = [
    'the quick brown fox',
    'the slow brown dog',
    'the quick red dog',
    'the lazy yellow fox'
]
vectorizer = TfidfVectorizer(use_idf=False, norm='l1')
bag_of_words = vectorizer.fit_transform(body)

svd = TruncatedSVD(n_components=2)
lsa = svd.fit_transform(bag_of_words)

topic_encoded_df = pd.DataFrame(lsa, index=['text_1', 'text_2', 'text_3', 'text_4'], columns=['topic_1', 'topic_2'])

In [167]:
topic_encoded_df

Unnamed: 0,topic_1,topic_2
text_1,0.423726,0.074881
text_2,0.378963,-0.192278
text_3,0.378963,-0.192278
text_4,0.316547,0.360146


In [172]:
feature_names = vectorizer.get_feature_names_out()

In [175]:
Df1 = pd.DataFrame(bag_of_words.toarray(), columns = feature_names)
Df1 

Unnamed: 0,brown,dog,fox,lazy,quick,red,slow,the,yellow
0,0.25,0.0,0.25,0.0,0.25,0.0,0.0,0.25,0.0
1,0.25,0.25,0.0,0.0,0.0,0.0,0.25,0.25,0.0
2,0.0,0.25,0.0,0.0,0.25,0.25,0.0,0.25,0.0
3,0.0,0.0,0.25,0.25,0.0,0.0,0.0,0.25,0.25
