# Extracting features from text

In [21]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import scipy
print(scipy.__version__)
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import numpy as np



1.12.0


In [22]:
import gensim.downloader as api

In [23]:
import nltk
# Fetch the NLTK stop-word corpus (NLTK reports it as up to date when already present).
nltk.download('stopwords')
# Keep the English stop words in a set; preprocess_text tests each token against it.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Deea\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
def preprocess_text(text):
    """Turn a raw string into a list of stemmed tokens with stop words removed.

    The steps run in this order: lowercase the text, delete every character in
    ``string.punctuation``, delete runs of digits, split on whitespace, drop
    tokens present in the module-level ``stop_words`` set, and Porter-stem the
    remaining tokens.

    Args:
        text: The raw document as a ``str``.

    Returns:
        A list of stemmed tokens in their original order.
    """
    # Punctuation and digits are deleted outright (replaced by the empty string).
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    cleaned = re.sub(punctuation_pattern, '', text.lower())
    cleaned = re.sub(r'\d+', '', cleaned)

    porter = PorterStemmer()
    stemmed_tokens = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        stemmed_tokens.append(porter.stem(token))
    return stemmed_tokens

### Example of how the preprocessing works
# Sample sentence containing upper case, punctuation, digits and stop words.
text = "I really wanna see,  every step of the process of extracting features from text1233!!!"
# `words` receives the list of stemmed tokens returned by preprocess_text.
words = preprocess_text(text)
--- Output
Lowering the text i really wanna see,  every step of the process of extracting features from text1233!!!
Removing the punctuation i really wanna see  every step of the process of extracting features from text1233
i really wanna see  every step of the process of extracting features from text
Words that were retrieved ['i', 'really', 'wanna', 'see', 'every', 'step', 'of', 'the', 'process', 'of', 'extracting', 'features', 'from', 'text']
Without stop words:  ['really', 'wanna', 'see', 'every', 'step', 'process', 'extracting', 'features', 'text']
After stemming:  ['realli', 'wanna', 'see', 'everi', 'step', 'process', 'extract', 'featur', 'text']



In [25]:
def get_processed_docs(texts):
    """Apply ``preprocess_text`` to every document.

    Args:
        texts: An iterable of raw document strings.

    Returns:
        A list with one token list per input document, in input order.
    """
    return list(map(preprocess_text, texts))

In [26]:
def get_vocabulary(texts):
    """Collect the distinct preprocessed tokens found across all documents.

    Args:
        texts: An iterable of raw document strings.

    Returns:
        A sorted list of the unique tokens produced by ``preprocess_text``.
    """
    unique_tokens = {
        token
        for raw_doc in texts
        for token in preprocess_text(raw_doc)
    }
    return sorted(unique_tokens)

In [27]:
def get_bow(texts):
    """Build bag-of-words count vectors for a collection of raw documents.

    Each document is preprocessed exactly once; the vocabulary is derived from
    those same token lists, so one-shot iterables (e.g. generators) work and no
    document is tokenised and stemmed twice.

    Args:
        texts: An iterable of raw document strings.

    Returns:
        A list with one count vector (list of ints) per document. Column ``j``
        counts occurrences of the ``j``-th token of the sorted vocabulary,
        which is the same ordering ``get_vocabulary`` produces.
    """
    processed_texts = get_processed_docs(texts)

    # Sorted distinct tokens: identical content and order to get_vocabulary(texts).
    vocabulary = sorted({word for doc in processed_texts for word in doc})
    vocab_index_map = {word: idx for idx, word in enumerate(vocabulary)}

    bow_vectors = []
    for doc in processed_texts:
        vector = [0] * len(vocabulary)
        for word in doc:
            # Every token is in the map because the vocabulary was built from
            # these very token lists.
            vector[vocab_index_map[word]] += 1
        bow_vectors.append(vector)

    return bow_vectors

# Using Word2Vec

In [28]:
def get_word2VecGensimModel(texts, vector_size=100, window=5, min_count=1, workers=4):
    """Train a gensim Word2Vec model on the preprocessed documents.

    The documents are tokenised with ``get_processed_docs`` and the resulting
    token lists are passed to Word2Vec as its training sentences.

    Args:
        texts: An iterable of raw document strings.
        vector_size: Dimensionality of the learned word vectors.
        window: Context window size passed to Word2Vec.
        min_count: Minimum token frequency passed to Word2Vec.
        workers: Number of worker threads passed to Word2Vec.

    Returns:
        The trained ``Word2Vec`` model.
    """
    processed_texts = get_processed_docs(texts)
    model = Word2Vec(
        sentences=processed_texts,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )
    return model

In [29]:

def featureComputation(model,data):
    """Represent each phrase as the mean of its known word vectors.

    Args:
        model: Either a gensim ``KeyedVectors`` instance or a full ``Word2Vec``
            model; for the latter the vectors are read from its ``wv``
            attribute.
        data: An iterable of phrases (strings). Each phrase is split on
            whitespace only.
            NOTE(review): no ``preprocess_text`` step is applied here, so a
            model trained on stemmed tokens may miss many words — confirm
            that callers pass suitably prepared phrases.

    Returns:
        A list with one numpy vector of length ``vector_size`` per phrase.
        Phrases with no in-vocabulary word get an all-zero vector.
    """
    # A Word2Vec model keeps its vectors on `.wv`; KeyedVectors is used directly.
    keyed_vectors = getattr(model, "wv", model)
    # Dict membership is O(1); scanning the `index_to_key` list was O(vocabulary)
    # for every single token.
    known_words = keyed_vectors.key_to_index

    features = []
    for phrase in data:
        vectors = [keyed_vectors[word] for word in phrase.split() if word in known_words]
        if vectors:
            result = np.mean(vectors, axis=0)
        else:
            # Use an ndarray so every row of the result has the same type.
            result = np.zeros(keyed_vectors.vector_size)
        features.append(result)
    return features

In [30]:
def get_GoogleWord2VecModel():
    """Load the 'word2vec-google-news-300' vectors via the gensim downloader API.

    Returns:
        Whatever ``api.load`` returns for that dataset name.
    """
    pretrained_name = 'word2vec-google-news-300'
    google_vectors = api.load(pretrained_name)
    return google_vectors


In [31]:
# model=get_GoogleWord2VecModel()


In [32]:
from sklearn.cluster import KMeans
def get_KMeansSklearn(n_clusters,randomstates=0):
    """Create an unfitted scikit-learn ``KMeans`` estimator.

    Args:
        n_clusters: Number of clusters, forwarded as ``n_clusters``.
        randomstates: Seed forwarded as ``random_state`` (defaults to 0).

    Returns:
        The configured ``KMeans`` instance.
    """
    kmeans_options = {"n_clusters": n_clusters, "random_state": randomstates}
    return KMeans(**kmeans_options)

In [33]:





import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

In [34]:
def get_client():
    """Create an Azure ``TextAnalyticsClient`` from environment variables.

    Reads the service endpoint from ``LANGUAGE_ENDPOINT`` and the access key
    from ``LANGUAGE_KEYSEC``.

    Returns:
        A ``TextAnalyticsClient`` authenticated with an ``AzureKeyCredential``.

    Raises:
        KeyError: If either environment variable is not set.
    """
    service_url = os.environ["LANGUAGE_ENDPOINT"]
    secret = os.environ["LANGUAGE_KEYSEC"]
    # Do not print the endpoint or key: notebook outputs are saved with the file.
    credential = AzureKeyCredential(secret)
    return TextAnalyticsClient(endpoint=service_url, credential=credential)


In [35]:
#client=get_client()
#print(client)

In [36]:
def analyze_batch(batch,client):
    """Run sentiment analysis on one batch and keep only the successful results.

    Args:
        batch: The documents passed to ``client.analyze_sentiment``.
        client: An object exposing ``analyze_sentiment(batch, show_opinion_mining=...)``.

    Returns:
        A list of the response items whose ``is_error`` attribute is falsy.
        Items flagged as errors are dropped, so the result can be shorter
        than ``batch``.
    """
    successful = []
    for analysed in client.analyze_sentiment(batch, show_opinion_mining=True):
        if analysed.is_error:
            continue
        successful.append(analysed)
    return successful
    


In [37]:
def get_sentiments(texts):
    """Analyse the sentiment of all texts, sending them in batches of 10.

    A client is created with ``get_client`` and each slice of up to 10 texts
    is processed by ``analyze_batch``.

    Args:
        texts: A sliceable sequence of documents with a length.

    Returns:
        A ``(results, client)`` tuple: the concatenated per-batch results and
        the client that was used.
        NOTE(review): ``analyze_batch`` drops errored documents, so ``results``
        may be shorter than ``texts`` and positions may no longer line up.
    """
    client = get_client()
    chunk_len = 10
    collected = []
    for start in range(0, len(texts), chunk_len):
        chunk = texts[start:start + chunk_len]
        collected.extend(analyze_batch(chunk, client))
    return collected, client

In [38]:
def show_sentiments(docs,text):
    """Print each document's text followed by its overall sentiment.

    Args:
        docs: An iterable of results exposing a ``sentiment`` attribute.
        text: An indexable collection; ``text[i]`` is printed alongside the
            ``i``-th item of ``docs``.
    """
    for position, analysed in enumerate(docs):
        original = text[position]
        print(f"Document text:{original}")
        print(f"Overall sentiment:{analysed.sentiment}")
        

In [39]:
# Hierarchical clustering

In [40]:
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
def get_tfidf(texts):
    """Fit a default ``TfidfVectorizer`` on the texts and transform them.

    Args:
        texts: The raw documents passed to ``fit_transform``.

    Returns:
        The matrix returned by ``fit_transform``. The fitted vectorizer itself
        is discarded.
    """
    return TfidfVectorizer().fit_transform(texts)

In [42]:
def get_hierarchical_clusters(n_clusters,features):
    """Cluster documents hierarchically on TF-IDF features and plot a dendrogram.

    The documents are vectorised with ``get_tfidf``, clustered with
    complete-linkage agglomerative clustering on cosine distance, the labels
    are printed, and a dendrogram built from the same dense matrix is shown.

    Args:
        n_clusters: Number of clusters for ``AgglomerativeClustering``.
        features: The raw documents (despite the name, they are passed to
            ``get_tfidf``); must support ``len``.

    Returns:
        None. The labels are printed and the dendrogram is displayed.
    """
    # Densify once and reuse the array for both the clustering and the linkage.
    dense_tfidf = get_tfidf(features).toarray()

    try:
        # `metric` replaced `affinity` in scikit-learn 1.2; `affinity` was
        # removed in 1.4 and raises TypeError there.
        model = AgglomerativeClustering(n_clusters=n_clusters, metric="cosine", linkage="complete")
    except TypeError:
        # Older scikit-learn releases only accept the `affinity` keyword.
        model = AgglomerativeClustering(n_clusters=n_clusters, affinity="cosine", linkage="complete")

    labels = model.fit_predict(dense_tfidf)
    print(labels)

    linkage_matrix = linkage(dense_tfidf, 'complete', metric='cosine')
    plt.figure(figsize=(10, 7))
    dendrogram(linkage_matrix, orientation='top', labels=[f"Doc {idx}" for idx in range(len(features))])
    plt.show()

In [43]:
def cluster_text(model, text, vectorizer):
    """Vectorise the given texts and return the cluster labels from the model.

    Args:
        model: An object exposing ``evaluate(features)``.
            NOTE(review): assumes ``evaluate`` returns a two-item
            ``(centroids, labels)`` pair — confirm against the model class.
        text: The documents passed to ``vectorizer.transform``.
        vectorizer: A fitted vectorizer whose ``transform`` result supports
            ``toarray()``.

    Returns:
        The second item returned by ``model.evaluate`` (the labels).
    """
    dense_features = vectorizer.transform(text).toarray()
    _, assignments = model.evaluate(dense_features)
    return assignments