# Read and explore the data

In [15]:
# Importing the pandas library and aliasing it as 'pd'
import pandas as pd

# Reading a CSV file named "Language Detection.csv" into a DataFrame
df = pd.read_csv("Language Detection.csv")

# Shuffling the rows of the DataFrame with a specific random state (42)
df = df.sample(frac=1, random_state=42)

# Resetting the index of the DataFrame and dropping the old index
df = df.reset_index(drop=True)

# Displaying the first few rows of the DataFrame
df.head()


Unnamed: 0,Text,Language
0,"И с этими словами она села в его карету, и, да...",Russian
1,Sistemi di tipo probabilistico erano invasi di...,Italian
2,Machine learning involves computers discoverin...,English
3,Несколько языковых версий опубликовали подборк...,Russian
4,"[225] In May 2014, Wikimedia Foundation named ...",English


In [18]:
# Count the occurrences of each language in the "Language" column
language_counts = df["Language"].value_counts()
print(language_counts)

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64


# Data preparation

In [20]:
# Importing numpy for numerical operations
import numpy as np

# Importing CountVectorizer to create bag of word features
from sklearn.feature_extraction.text import CountVectorizer

# Importing train_test_split for splitting the data
from sklearn.model_selection import train_test_split

# Creating numpy arrays for the "Text" and "Language" columns
x = np.array(df["Text"])
y = np.array(df["Language"])

# Initializing a CountVectorizer to create bag of word features
cv = CountVectorizer()

# Transforming the text data into a sparse matrix of token counts
X = cv.fit_transform(x)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0)


# Modeling

In [21]:
# Import the Multinomial Naïve Bayes classifier from scikit-learn
from sklearn.naive_bayes import MultinomialNB

# Create an instance of the Multinomial Naïve Bayes model
model = MultinomialNB()

# Fit (train) the model using the training data
model.fit(X_train, y_train)


MultinomialNB()

In [25]:
# Calculate the accuracy score of the model on the test data
accuracy_score = model.score(X_test, y_test)

# Print the model's accuracy score
print(f"The model accuracy score is: {accuracy_score}")


The model accuracy score is: 0.9792069632495164


# Using the model

In [26]:
user = input("Enter a Text: ")
data = cv.transform([user]).toarray()
output = model.predict(data)
print(output)

## You can test on 
## ("Позже континенты воссоединились, образовав Паннотию, которая распалась около") this sentence,
## which is written in russian

Enter a Text: i love my mum
['English']


In [27]:
# Create a loop to continuously ask for user input
while True:
    user = input("Enter a Text (or 'end' to exit): ")  # Prompt the user to enter text
    
    # Check if the user wants to exit the loop
    if user.lower() == 'end':
        break  # Exit the loop if user enters 'end'

    data = cv.transform([user]).toarray()  # Transform user input into numerical data
    output = model.predict(data)  # Use the model to predict the language
    print(output)  # Print the predicted language



Enter a Text (or 'end' to exit): I love my mum
['English']
Enter a Text (or 'end' to exit): Позже континенты воссоединились, образовав Паннотию, которая распалась около
['Russian']
Enter a Text (or 'end' to exit): dcubcd
['English']
Enter a Text (or 'end' to exit): Buenas tardes
['Spanish']
Enter a Text (or 'end' to exit): Ich liebe dich
['German']
Enter a Text (or 'end' to exit): end
