# <center>Speech recognition & topic modeling relying on mic recordings</center>

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.sql.types import ArrayType
from pyspark.ml.clustering import LDA
from pyspark.sql.functions import udf
from pyspark.ml.feature import CountVectorizer, Tokenizer, StopWordsRemover

In [2]:
# Spark session for this notebook, registered under the app name "recordings"
spark = (
    SparkSession.builder
    .appName("recordings")
    .getOrCreate()
)

In [3]:
# load the recordings CSV: the first row is treated as the header and
# column types are inferred from the data
mic_recordings = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(r"C:\Users\Public\DW\___PSYCHOGRAPHICS\mic_recordings\mic_recordings.csv")
)

In [4]:
# register the DataFrame as the SQL view "mic_recordings" ...
mic_recordings.createOrReplaceTempView("mic_recordings")
# ... and query it for the audio recording file paths
query = spark.sql("SELECT path FROM mic_recordings")

In [5]:
# collect the single 'path' column to the driver and, in the same pass,
# collapse every doubled backslash in a path into a single backslash
full_audios = [
    row[0].replace('\\\\', '\\')
    for row in query.select('path').collect()
]

In [6]:
import speech_recognition as sr

# recognizer instance used here to read the audio files
r = sr.Recognizer()

# recorded audio objects, appended in the same order as full_audios
objs = []

# open each file as an audio source and record its contents
for wav_path in full_audios:
    with sr.AudioFile(wav_path) as source:
        objs.append(r.record(source))

In [7]:
# transcripts of the recordings that could be recognised
text_file = []

# Using Google Speech Recognition to transcribe the speech.
# Iterate over the recordings directly (paired with their paths for reporting)
# instead of indexing with range(len(...)).
for audio_path, audio_data in zip(full_audios, objs):
    try:
        text_file.append(r.recognize_google(audio_data))
    except sr.UnknownValueError:
        # Raised when the speech in a recording is unintelligible. Skip that
        # recording so one bad clip does not abort the whole batch; request or
        # connection failures (sr.RequestError) are still allowed to propagate.
        print(f"Skipping unintelligible recording: {audio_path}")

### Text pre-processing

In [8]:
# wrap every transcript in a 1-tuple so each becomes one row of a
# single-column DataFrame whose column is named 'value'
documents = spark.createDataFrame(
    [(transcript,) for transcript in text_file],
    schema=['value'],
)

In [9]:
# --- configure the three pre-processing stages -------------------------------
# splits the raw text in 'value' into a list of tokens
tokenizer = Tokenizer(
    inputCol='value',
    outputCol='tokens',
)

# drops the default English stop words from the token lists
stopwords = StopWordsRemover.loadDefaultStopWords('english')
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered_tokens',
    stopWords=stopwords,
)

# turns the remaining tokens into term-count vectors in 'features'
cv = CountVectorizer(
    inputCol='filtered_tokens',
    outputCol='features',
)

# --- apply the stages in order -----------------------------------------------
tokenized_df = tokenizer.transform(documents)
filtered_df = remover.transform(tokenized_df)

# fitting learns the vocabulary of unique words; transforming applies it
cv_model = cv.fit(filtered_df)
vectorized_df = cv_model.transform(filtered_df)

### Latent Dirichlet Allocation (LDA) 

In [10]:
# model settings
num_topics = 7  # how many topics to extract
max_iterations = 10  # upper bound on training iterations

# build the LDA estimator from the settings above and train it on the
# vectorized documents
lda = LDA(
    maxIter=max_iterations,
    k=num_topics,
)
lda_model = lda.fit(vectorized_df)

In [11]:
# per-topic description from the fitted model (includes the 'topic' and
# 'termIndices' columns used below); 10 terms per topic is the library default
topics = lda_model.describeTopics(maxTermsPerTopic=10)

In [12]:
# vocabulary learned by the count vectorizer; a term index is a position in it
vocab = cv_model.vocabulary


def _indices_to_terms(term_indices):
    """Translate a list of vocabulary indices into the corresponding terms."""
    return [vocab[idx] for idx in term_indices]


# UDF wrapper around the lookup, returning an array of strings
map_term_indices_udf = udf(_indices_to_terms, ArrayType(StringType()))

# attach the readable terms next to each topic's term indices
topics_with_terms = topics.withColumn(
    "terms",
    map_term_indices_udf(topics["termIndices"]),
)

# print every topic with its terms, without truncating the cell contents
topics_with_terms.select("topic", "terms").show(truncate=False)

+-----+------------------------------------------------------------------------------------------------------------+
|topic|terms                                                                                                       |
+-----+------------------------------------------------------------------------------------------------------------+
|0    |[biology, experience, make, act, try, environmental, yes, mistake, time, full]                              |
|1    |[drugs, methamphetamine, pathetic, euphoric, question, cocaine, consumers, cannabis, psilocybin, altruistic]|
|2    |[know, good, need, time, important, things, well, saying, trust, slept]                                     |
|3    |[thing, sexual, performance, play, whether, case, differences, monoamine, experienced, chemically]          |
|4    |[experience, say, mind, really, experiences, strongly, body, people, like, environment]                     |
|5    |[impact, play, endogenous, eating, worked, controversial,