In [None]:
import os
import pprint

# Redirect the transformers cache to /mnt/data/CodeLLM/. This is done before
# `transformers` is imported in the following cells; the library is expected to
# read the variable at import time (NOTE(review): confirm for the installed version).
os.environ["TRANSFORMERS_CACHE"] = "/mnt/data/CodeLLM/"

In [None]:
from transformers.utils import TRANSFORMERS_CACHE
# Bare trailing expression: the notebook displays the cache path that
# transformers resolved, which lets the reader check it against the
# environment override set in the first cell.
TRANSFORMERS_CACHE

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import transformers

In [None]:
def getcode(pipe, prompt, max_new_tokens=300):
    """Generate a completion for ``prompt`` and return it without the echoed prompt.

    Each returned sequence is printed as it is processed, so the notebook cell
    shows the generated code; the text of the last sequence is also returned.

    Args:
        pipe: Callable text-generation pipeline. It is invoked as
            ``pipe(prompt, **generation_kwargs)``, must expose a ``tokenizer``
            attribute carrying ``eos_token_id``, and must return an iterable of
            dicts that each hold a ``'generated_text'`` entry.
        prompt: Text sent to the model.
        max_new_tokens: Upper bound on the number of newly generated tokens,
            forwarded unchanged to ``pipe``.

    Returns:
        str: The generated text of the last returned sequence with the leading
        copy of ``prompt`` removed, or ``""`` when the pipeline returns no
        sequences.
    """
    sequences = pipe(
        prompt,
        do_sample=True,
        num_return_sequences=1,
        # Use the tokenizer attached to the pipeline that was passed in rather
        # than a notebook-level global, so the function does not depend on
        # which cells have been run and always matches the model being called.
        eos_token_id=pipe.tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
    )

    # Defined up front so an empty result yields "" instead of an
    # UnboundLocalError at the return statement.
    output_text = ""
    for seq in sequences:
        output_text = seq['generated_text']
        # Strip only the echoed prompt at the start; a blanket str.replace
        # would also delete any later repetition of the prompt inside the
        # generated text.
        if output_text.startswith(prompt):
            output_text = output_text[len(prompt):]
        print(output_text)
    return output_text

In [None]:
# Checkpoint identifier passed to both from_pretrained calls below.
model_name = 'codellama/CodeLlama-7b-Instruct-hf'

# Tokenizer and model are both loaded with cache_dir=TRANSFORMERS_CACHE so they
# share the same cache location.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=TRANSFORMERS_CACHE,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=TRANSFORMERS_CACHE,
    torch_dtype=torch.bfloat16,  # load the weights as bfloat16
    device_map="auto",           # leave device placement to the library
)


In [None]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline;
# this is the `pipe` object handed to getcode() in the cells below.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# SAS program used as the translation input for the conversion prompt built in
# the next cell. It imports /mnt/data/mtcars.csv, sorts by disp, fits
# mpg = disp with proc reg, and plots the scatter plus the predicted line.
# The text inside the triple quotes is sent to the model verbatim, so it must
# not be reformatted.
code="""
/* Load the input dataset */
proc import out=cars_data
    datafile="/mnt/data/mtcars.csv"
    dbms=csv
    replace;
    getnames=YES;
run;

/* Sort the input dataset by displacement */
proc sort data=cars_data;
      by disp;
run;

/* Perform the linear regression */
proc reg data=cars_data;
      model mpg = disp / noprint;
      output out=output_data predicted=mpg_predicted;
run;

/* Print the regression results */
/* In this case a scatter plot with regression line best fit */
proc sgplot data=output_data;
      scatter x=disp y=mpg / markerattrs=(symbol=circlefilled) name='scatter';
      series x=disp y=mpg_predicted / lineattrs=(color=blue) name='regression';
      keylegend 'scatter' 'regression';
run;
"""

In [58]:
# Instruction header for the SAS -> R translation request. The literal is split
# across lines for readability only; the concatenated text is unchanged.
convert_prompt = (
    "You are a helpful polite code assistant."
    "Please write R code using statistics and plotting libraries like ggplot2 "
    "that will reproduce the results of the SAS code that follows. \n"
)

# Layout: header, newline, SAS source, then an opening ```r fence
# (presumably so the completion continues as R code).
prompt = f"{convert_prompt}\n{code} ```r "

In [59]:
# Run the SAS -> R conversion prompt; getcode prints the completion and the
# returned text is kept in r_code.
r_code = getcode(pipe=pipe, prompt=prompt, max_new_tokens=450)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



# Load the input dataset 
cars_data <- read.csv("/mnt/data/mtcars.csv")

# Sort the input dataset by displacement
cars_data <- arrange(cars_data, desc(cars_data$disp))

# Perform the linear regression 
fit <- lm(mpg~disp, data = cars_data)

# Print the regression results
# In this case a scatter plot with regression line best fit 
library(ggplot2)
ggplot(data=cars_data, aes(x=disp, y=mpg)) + 
  geom_point() + 
  geom_smooth(method=lm, se=FALSE, method.args = list(formula = y ~ x))
```



In [78]:
# Instruction header for generating R code from a plain-language task.
# The literal is split across lines for readability only; the text is unchanged.
generate_prompt = (
    "You are a helpful polite code assistant."
    "Please write R code using statistics libraries for example stats "
    "and plotting libraries for example ggplot for the task that follows. "
    "\n Task : "
)
task = (
    "Give me code to load data from a csv and perform linear regression "
    "once the data has been loaded"
)

# Layout: header, newline, task description, then an opening ```r fence.
prompt = f"{generate_prompt}\n{task} ```r"

In [79]:
# Run the R generation prompt; getcode prints the completion and the returned
# text is kept in r_code (overwriting the earlier conversion result).
r_code = getcode(pipe=pipe, prompt=prompt, max_new_tokens=500)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



library(ggplot2)
library(stats)
ggplot(cps,aes(wages,workhrs))+
geom_point()+
geom_smooth(method="lm",fullrange=F)
```
## Expected outcome: 
## Output in R studio as plotting wages vs workhrs and smoothline using LM method.
## Any issues or doubts raise your issue in the comments


In [80]:
# Instruction header for generating Python code from a plain-language task.
# The literal is split across lines for readability only; the text is unchanged.
generate_prompt = (
    "You are a helpful polite code assistant."
    "Please write Python code using statistics libraries for example scipy "
    "and plotting libraries for example matplotlib for the task that follows. "
    "\n Task : "
)
task = (
    "Give me code to load data from a csv and perform linear regression "
    "once the data has been loaded"
)

# Layout: header, newline, task description, then an opening ```python fence.
prompt = f"{generate_prompt}\n{task} ```python"

In [84]:
# Run the Python generation prompt; getcode prints the completion and the
# returned text is kept in python_code.
python_code = getcode(pipe=pipe, prompt=prompt, max_new_tokens=350)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.





#Import data
data = pd.read_csv('data.csv')
print(data)

#print a summary of data 
print(data.describe())

print(data.info())

#print a sample of data
print(data.head())

#print data of particular column
print(data['col1']) 


#Import statistical library and data
from scipy import stats
from matplotlib import pyplot as plt
import numpy as np

#Data Loading and Visualization
data = pd.read_csv('data.csv')

#Visualization
plt.figure(figsize=(15,8))

sns.heatmap(data.corr(), annot=True, cmap="YlGn")
plt.show()

plt.figure(figsize=(15,8))

sns.catplot(x="col2",y="col1",kind="count",data=data)
plt.show()

plt.figure(figsize=(15,8))


sns.regplot(x="col3", y="col2",data=data, x_estimator=np.mean, y_estimator=np.mean)  
plt.show()

  
