In [1]:
import pandas as pd
import numpy as np
import nltk


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kevinmelchert/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
data = pd.read_csv("../data/capstone.messages.csv")
data.head(2)

Unnamed: 0,_id,channel._id,channel.name,text,date
0,6579c6139c8c4517bf26e14e,65784dec2d93a25348d0ab85,cryptowhalesreal,I hope eveyone could get this fabulous profit ...,2023-12-12T09:40:12.000Z
1,6579c6149c8c4517bf26e14f,65784dec2d93a25348d0ab85,cryptowhalesreal,🎁 #FREE_SIGNAL \n\n🟣 Future Signal 🟣\n📈 BUY\n❇...,2023-12-11T12:39:33.000Z


In [3]:
# removing empty rows
data = data[data.text.notna()]
data.reset_index()

Unnamed: 0,index,_id,channel._id,channel.name,text,date
0,0,6579c6139c8c4517bf26e14e,65784dec2d93a25348d0ab85,cryptowhalesreal,I hope eveyone could get this fabulous profit ...,2023-12-12T09:40:12.000Z
1,1,6579c6149c8c4517bf26e14f,65784dec2d93a25348d0ab85,cryptowhalesreal,🎁 #FREE_SIGNAL \n\n🟣 Future Signal 🟣\n📈 BUY\n❇...,2023-12-11T12:39:33.000Z
2,2,6579c6149c8c4517bf26e150,65784dec2d93a25348d0ab85,cryptowhalesreal,Another 40% profit after the entry for a buy s...,2023-12-10T13:24:29.000Z
3,3,6579c6149c8c4517bf26e151,65784dec2d93a25348d0ab85,cryptowhalesreal,🟣 Future Signal 🟣\n📈 BUY\n❇️ #SOL - USDT\n🏢 Ex...,2023-12-10T13:24:20.000Z
4,4,6579c6149c8c4517bf26e152,65784dec2d93a25348d0ab85,cryptowhalesreal,This is the correct way of starting the weeken...,2023-12-09T07:54:00.000Z
...,...,...,...,...,...,...
5696,5778,6579c6b99c8c4517bf26f7e0,657982e5660a918f6a6e4b70,cryptotipstrick,⚡️⚡️ #WOO/USDT ⚡️⚡️\nExchanges: Binance Future...,2023-11-16T15:42:24.000Z
5697,5779,6579c6b99c8c4517bf26f7e1,657982e5660a918f6a6e4b70,cryptotipstrick,⚡️⚡️ #SNX/USDT ⚡️⚡️\nExchanges: Binance Future...,2023-11-16T15:42:09.000Z
5698,5780,6579c6b99c8c4517bf26f7e2,657982e5660a918f6a6e4b70,cryptotipstrick,⚡️⚡️ #RNDR/USDT ⚡️⚡️\nExchanges: Binance Futur...,2023-11-16T15:41:51.000Z
5699,5781,6579c6b99c8c4517bf26f7e3,657982e5660a918f6a6e4b70,cryptotipstrick,⚡️⚡️ #RNDR/USDT ⚡️⚡️\nExchanges: Binance Futur...,2023-11-16T15:41:50.000Z


In [4]:
# Correcting the function to effectively remove all hanging 'n's
def clean_text_final_corrected(text):
    # Removing line breaks and replacing with space
    text = text.replace('\n', ' ')
    # Removing any standalone 'n' characters that might have been left from '\n'
    text = re.sub(r'\b[n]\b', ' ', text)
    # Remaining cleaning steps
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^a-zA-Z0-9%$]', ' ', text)
    text = re.sub(r'\s+', ' ', text)  # Removing extra space
    return text.lower()

# Reapplying the corrected text cleaning function
data['cleaned_text'] = data['text'].apply(clean_text_final_corrected)

# Displaying the first few rows of the updated cleaned data
data[['text', 'cleaned_text']].head(10)


Unnamed: 0,text,cleaned_text
0,I hope eveyone could get this fabulous profit ...,i hope eveyone could get this fabulous profit ...
1,🎁 #FREE_SIGNAL \n\n🟣 Future Signal 🟣\n📈 BUY\n❇...,free signal future signal buy tia usdt exchan...
2,Another 40% profit after the entry for a buy s...,another 40% profit after the entry for a buy s...
3,🟣 Future Signal 🟣\n📈 BUY\n❇️ #SOL - USDT\n🏢 Ex...,future signal buy sol usdt exchange all avail...
4,This is the correct way of starting the weeken...,this is the correct way of starting the weeken...
5,🟣 Future Signal 🟣\n📈 BUY\n❇️ #SXP - USDT\n🏢 Ex...,future signal buy sxp usdt exchange all avail...
6,Semms #YGG doen't have enough strength. We jus...,semms ygg doen t have enough strength we just ...
7,The price had a bearish shadow; don't worry 😮‍...,the price had a bearish shadow don t worry nti...
8,🎁 #FREE_SIGNAL \n\n🟣 Future Signal 🟣\n📈 BUY\n❇...,free signal future signal buy ygg usdt exchan...
9,We keep the Monday Blues away with profitable ...,we keep the monday blues away with profitable ...


In [5]:
# Manual tokenization function
def manual_tokenize(text):
    # Simple tokenization by splitting on spaces (this is less sophisticated than NLTK's word_tokenize)
    return text.split()

# Applying manual tokenization to the cleaned text
data['manual_tokens'] = data['cleaned_text'].apply(manual_tokenize)

# Displaying the first few rows with manual tokens
data[['cleaned_text', 'manual_tokens']].head()


Unnamed: 0,cleaned_text,manual_tokens
0,i hope eveyone could get this fabulous profit ...,"[i, hope, eveyone, could, get, this, fabulous,..."
1,free signal future signal buy tia usdt exchan...,"[free, signal, future, signal, buy, tia, usdt,..."
2,another 40% profit after the entry for a buy s...,"[another, 40%, profit, after, the, entry, for,..."
3,future signal buy sol usdt exchange all avail...,"[future, signal, buy, sol, usdt, exchange, all..."
4,this is the correct way of starting the weeken...,"[this, is, the, correct, way, of, starting, th..."


In [6]:
# Manually defining a basic list of English stopwords
basic_stopwords = set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", 
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", 
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", 
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that", 
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", 
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", 
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", 
    "at", "by", "for", "with", "about", "against", "between", "into", "through", 
    "during", "before", "after", "above", "below", "to", "from", "up", "down", 
    "in", "out", "on", "off", "over", "under", "again", "further", "then", 
    "once", "here", "there", "when", "where", "why", "how", "all", "any", 
    "both", "each", "few", "more", "most", "other", "some", "such", "no", 
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", 
    "can", "will", "just", "don", "should", "now"
])

# Function to remove stop words from tokens using the manually defined list
def remove_manual_stop_words(tokens):
    return [word for word in tokens if word not in basic_stopwords]

# Applying the stop word removal function using the manual list
data['tokens_no_stopwords_manual'] = data['manual_tokens'].apply(remove_manual_stop_words)

# Displaying the first few rows with tokens without stop words
data[['manual_tokens', 'tokens_no_stopwords_manual']].head()


Unnamed: 0,manual_tokens,tokens_no_stopwords_manual
0,"[i, hope, eveyone, could, get, this, fabulous,...","[hope, eveyone, could, get, fabulous, profit, ..."
1,"[free, signal, future, signal, buy, tia, usdt,...","[free, signal, future, signal, buy, tia, usdt,..."
2,"[another, 40%, profit, after, the, entry, for,...","[another, 40%, profit, entry, buy, signal, cry..."
3,"[future, signal, buy, sol, usdt, exchange, all...","[future, signal, buy, sol, usdt, exchange, ava..."
4,"[this, is, the, correct, way, of, starting, th...","[correct, way, starting, weekend, ntouched, ta..."


In [7]:
# Simplified lemmatization function (without using WordNet)
# This function will perform a basic form of lemmatization based on simple heuristics

def simplified_lemmatize(word):
    # Basic rules for lemmatization (this is a very rudimentary approach)
    if word.endswith('ing'):
        return word[:-3]
    if word.endswith('ed'):
        return word[:-2]
    if word.endswith('s'):
        return word[:-1]
    return word

def apply_simplified_lemmatization(tokens):
    return [simplified_lemmatize(word) for word in tokens]

# Applying simplified lemmatization to the tokens
data['simplified_lemmatized_tokens'] = data['tokens_no_stopwords_manual'].apply(apply_simplified_lemmatization)

# Displaying the first few rows with simplified lemmatized tokens
data[['tokens_no_stopwords_manual', 'simplified_lemmatized_tokens']].head()


Unnamed: 0,tokens_no_stopwords_manual,simplified_lemmatized_tokens
0,"[hope, eveyone, could, get, fabulous, profit, ...","[hope, eveyone, could, get, fabulou, profit, f..."
1,"[free, signal, future, signal, buy, tia, usdt,...","[free, signal, future, signal, buy, tia, usdt,..."
2,"[another, 40%, profit, entry, buy, signal, cry...","[another, 40%, profit, entry, buy, signal, cry..."
3,"[future, signal, buy, sol, usdt, exchange, ava...","[future, signal, buy, sol, usdt, exchange, ava..."
4,"[correct, way, starting, weekend, ntouched, ta...","[correct, way, start, weekend, ntouch, target,..."


In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Creating a CountVectorizer object for BoW
bow_vectorizer = CountVectorizer(max_features=1000)  # Limiting to top 1000 features for simplicity
bow_features = bow_vectorizer.fit_transform(data['cleaned_text']).toarray()

# Creating a TfidfVectorizer object for TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Limiting to top 1000 features
tfidf_features = tfidf_vectorizer.fit_transform(data['cleaned_text']).toarray()

# Converting the features into DataFrames for better readability
bow_df = pd.DataFrame(bow_features, columns=bow_vectorizer.get_feature_names_out())
tfidf_df = pd.DataFrame(tfidf_features, columns=tfidf_vectorizer.get_feature_names_out())

# Displaying the first few rows of each DataFrame
bow_df.head(), tfidf_df.head()


(   00  000  06  0d  10  100  1000  10x  11  12  ...  xrp  year  years  yet  \
 0   0    0   0   0   0    0     0    0   0   0  ...    0     0      0    0   
 1   0    0   1   0   6    0     0    0   1   1  ...    0     0      0    0   
 2   0    0   0   0   0    0     0    0   0   0  ...    0     0      0    0   
 3   0    0   0   0   1    0     0    0   1   0  ...    0     0      0    0   
 4   0    0   0   0   0    0     0    0   0   0  ...    0     0      0    0   
 
    york  you  your  youtube  zero  zone  
 0     0    0     0        0     0     0  
 1     0    0     0        0     0     0  
 2     0    0     0        0     0     0  
 3     0    0     0        0     0     0  
 4     0    0     0        0     0     0  
 
 [5 rows x 1000 columns],
     00  000        06   0d        10  100  1000  10x        11        12  ...  \
 0  0.0  0.0  0.000000  0.0  0.000000  0.0   0.0  0.0  0.000000  0.000000  ...   
 1  0.0  0.0  0.154434  0.0  0.464922  0.0   0.0  0.0  0.124710  0.109533 

In [15]:

# Initializing VADER's SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Function to get the compound sentiment score
def get_sentiment(text):
    return sia.polarity_scores(text)['compound']

# Applying the sentiment analysis to the cleaned text
data['sentiment_score'] = data['cleaned_text'].apply(get_sentiment)

# Categorizing sentiment into positive, neutral, or negative based on the compound score
data['sentiment_category'] = data['sentiment_score'].apply(lambda score: 'positive' if score > 0.05 else ('negative' if score < -0.05 else 'neutral'))

# Displaying the first few rows with sentiment scores and categories
data[['cleaned_text', 'sentiment_score', 'sentiment_category']].sort_values('sentiment_score', ascending=False).head(20)


Unnamed: 0,cleaned_text,sentiment_score,sentiment_category
2874,the seed collector na new take on blockchain g...,0.9976,positive
2663,report korean regulator approves issuance and...,0.9973,positive
1802,lecksis is giving away $500 nare you ready to...,0.9971,positive
2081,83% confess attraction to crypto fanatics on ...,0.9968,positive
1532,asia based crypto exchange xrex secures in pri...,0.9967,positive
1594,welcome to the world of limitless possibiliti...,0.9961,positive
2509,join the festivities and win with duelbits $5...,0.9961,positive
1647,discover defexa wallet join now for a chance ...,0.9961,positive
2502,join the festivities and win with duelbits $5...,0.9961,positive
2088,join the metaverse movement with this high po...,0.9958,positive
